From 748e8325fd0b2e09469c76f584b8e08c1ef03ca6 Mon Sep 17 00:00:00 2001 From: David Teigland Date: Nov 15 2022 16:11:30 +0000 Subject: watchdog timeout configuration Make the watchdog timeout configurable. The watchdog device must support the configured value, and all hosts must use the same value. The io_timeout should usually be configured by a similar factor. Set watchdog_fire_timeout and io_timeout in sanlock.conf, e.g. watchdog_fire_timeout=30 io_timeout=5 The defaults remain watchdog_fire_timeout 60 and io_timeout 10. wdmd --trytimeout can be used to test if the watchdog device supports a certain timeout value. --- diff --git a/src/cmd.c b/src/cmd.c index 5ab0ae2..ae57bf3 100644 --- a/src/cmd.c +++ b/src/cmd.c @@ -1326,7 +1326,7 @@ static void cmd_add_lockspace(struct cmd_args *ca, uint32_t cmd) io_timeout = ca->header.data; if (!io_timeout) - io_timeout = DEFAULT_IO_TIMEOUT; + io_timeout = com.io_timeout; rv = add_lockspace_start(&lockspace, io_timeout, &sp); if (rv < 0) { @@ -1577,7 +1577,7 @@ static void cmd_read_lockspace(struct task *task, struct cmd_args *ca, uint32_t if (!sector_size) { /* reads the first leader record to get sector size */ - result = delta_read_lockspace_sizes(task, &sd, DEFAULT_IO_TIMEOUT, &sector_size, &align_size); + result = delta_read_lockspace_sizes(task, &sd, com.io_timeout, &sector_size, &align_size); if (result < 0) goto out_close; if ((sector_size != 512) && (sector_size != 4096)) { @@ -1588,7 +1588,7 @@ static void cmd_read_lockspace(struct task *task, struct cmd_args *ca, uint32_t /* sets ls->name and io_timeout */ result = delta_read_lockspace(task, &sd, sector_size, align_size, host_id, &lockspace, - DEFAULT_IO_TIMEOUT, &io_timeout); + com.io_timeout, &io_timeout); if (result == SANLK_OK) result = 0; @@ -1677,7 +1677,7 @@ static void cmd_read_resource(struct task *task, struct cmd_args *ca, uint32_t c goto reply; } - token->io_timeout = DEFAULT_IO_TIMEOUT; + token->io_timeout = com.io_timeout; /* * These may be zero, in which case 
paxos_read_resource reads a 4K sector @@ -1778,7 +1778,7 @@ static void cmd_read_resource_owners(struct task *task, struct cmd_args *ca, uin goto reply; } - token->io_timeout = DEFAULT_IO_TIMEOUT; + token->io_timeout = com.io_timeout; /* * These may be zero, in which case paxos_read_resource reads a 4K sector @@ -1820,7 +1820,7 @@ static void cmd_write_lockspace(struct task *task, struct cmd_args *ca, uint32_t struct sanlk_lockspace lockspace; struct sync_disk sd; int fd, rv, result; - int io_timeout = DEFAULT_IO_TIMEOUT; + int io_timeout = com.io_timeout; fd = client[ca->ci_in].fd; @@ -1953,7 +1953,7 @@ static void cmd_write_resource(struct task *task, struct cmd_args *ca, uint32_t goto reply; } - token->io_timeout = DEFAULT_IO_TIMEOUT; + token->io_timeout = com.io_timeout; result = paxos_lease_init(task, token, num_hosts, write_clear); @@ -2302,6 +2302,8 @@ static int print_state_daemon(char *str) "max_worker_threads=%d " "write_init_io_timeout=%u " "use_aio=%d " + "io_timeout=%d " + "watchdog_fire_timeout=%d " "kill_grace_seconds=%d " "helper_pid=%d " "helper_kill_fd=%d " @@ -2330,7 +2332,9 @@ static int print_state_daemon(char *str) com.max_worker_threads, com.write_init_io_timeout, main_task.use_aio, - kill_grace_seconds, + com.io_timeout, + com.watchdog_fire_timeout, + com.kill_grace_seconds, helper_pid, helper_kill_fd, helper_full_count, diff --git a/src/delta_lease.c b/src/delta_lease.c index 9a8fc22..9d5aafd 100644 --- a/src/delta_lease.c +++ b/src/delta_lease.c @@ -849,7 +849,7 @@ int delta_lease_init(struct task *task, uint32_t checksum; if (!io_timeout) - io_timeout = DEFAULT_IO_TIMEOUT; + io_timeout = com.io_timeout; rv = sizes_from_flags(ls->flags, &sector_size, &align_size, &max_hosts, "LSF"); if (rv) diff --git a/src/direct.c b/src/direct.c index 661c4e8..7e00c52 100644 --- a/src/direct.c +++ b/src/direct.c @@ -55,7 +55,7 @@ static int direct_read_leader_sizes(struct task *task, struct sync_disk *sd, memset(data, 0, datalen); - rv = read_sectors(sd, 
4096, 0, 1, data, datalen, task, DEFAULT_IO_TIMEOUT, "read_sector_size"); + rv = read_sectors(sd, 4096, 0, 1, data, datalen, task, com.io_timeout, "read_sector_size"); if (rv < 0) { free(data); return rv; @@ -134,7 +134,7 @@ static int do_paxos_action(int action, struct task *task, int io_timeout, struct int j, rv = 0; if (!io_timeout) - io_timeout = DEFAULT_IO_TIMEOUT; + io_timeout = com.io_timeout; rv = sizes_from_flags(res->flags, &sector_size, &align_size, &max_hosts, "RES"); if (rv) @@ -309,7 +309,7 @@ static int do_delta_action(int action, memset(bitmap, 0, sizeof(bitmap)); if (!io_timeout) - io_timeout = DEFAULT_IO_TIMEOUT; + io_timeout = com.io_timeout; rv = sizes_from_flags(ls->flags, &sector_size, &align_size, &max_hosts, "LSF"); if (rv) @@ -706,7 +706,7 @@ int direct_dump(struct task *task, char *dump_path, int force_mode) memset(data, 0, sector_size); rv = read_sectors(&sd, sector_size, sector_nr, sector_count, data, datalen, - task, DEFAULT_IO_TIMEOUT, "dump"); + task, com.io_timeout, "dump"); magic_in(data, &magic); @@ -906,7 +906,7 @@ int direct_next_free(struct task *task, char *path) memset(data, 0, sector_size); rv = read_sectors(&sd, sector_size, sector_nr, 1, data, datalen, - task, DEFAULT_IO_TIMEOUT, "next_free"); + task, com.io_timeout, "next_free"); lr_end = (struct leader_record *)data; diff --git a/src/lockspace.c b/src/lockspace.c index 3deb2b0..a874148 100644 --- a/src/lockspace.c +++ b/src/lockspace.c @@ -842,6 +842,27 @@ static void *lockspace_thread(void *arg_in) } /* + * Tell wdmd to open the watchdog device, set the fire timeout and + * begin the keepalive loop that regularly pets the watchdog. This + * only happens for the first client/lockspace. This fails if the + * watchdog device cannot be opened by wdmd or does not support the + * requested fire timeout. + * + * For later clients/lockspaces, when wdmd already has the watchdog + * open, this does nothing (just verifies that fire timeout matches + * what's in use.) 
+ */ + rv = open_watchdog(wd_con, com.watchdog_fire_timeout); + if (rv < 0) { + log_erros(sp, "open_watchdog with fire_timeout %d failed %d", + com.watchdog_fire_timeout, rv); + acquire_result = SANLK_WD_ERROR; + delta_result = -1; + disconnect_watchdog(sp); + goto set_status; + } + + /* * acquire the delta lease */ @@ -989,7 +1010,7 @@ static void *lockspace_thread(void *arg_in) /* watchdog unlink was done in main_loop when thread_stop was set, to get it done as quickly as possible in case the wd is about to fire. */ - close_watchdog(sp); + disconnect_watchdog(sp); out: if (delta_result == SANLK_OK) delta_lease_release(&task, sp, &sp->host_id_disk, diff --git a/src/main.c b/src/main.c index 4c022de..a9ceee5 100644 --- a/src/main.c +++ b/src/main.c @@ -658,11 +658,11 @@ static void kill_pids(struct space *sp) * kill_grace_seconds */ - in_grace = now < (last_success + id_renewal_fail_seconds + kill_grace_seconds); + in_grace = now < (last_success + id_renewal_fail_seconds + com.kill_grace_seconds); if (sp->external_remove || (external_shutdown > 1)) { sig = SIGKILL; - } else if ((kill_grace_seconds > 0) && in_grace && cl->killpath[0]) { + } else if ((com.kill_grace_seconds > 0) && in_grace && cl->killpath[0]) { sig = SIGRUNPATH; } else if (in_grace) { sig = SIGTERM; @@ -1736,6 +1736,7 @@ static int do_daemon(void) } setup_limits(); + setup_timeouts(); setup_helper(); /* main task never does disk io, so we don't really need to set @@ -1780,7 +1781,12 @@ static int do_daemon(void) uname(&nodename); - log_warn("sanlock daemon started %s host %s (%s)", VERSION, our_host_name_global, nodename.nodename); + if (com.io_timeout != DEFAULT_IO_TIMEOUT || com.watchdog_fire_timeout != DEFAULT_WATCHDOG_FIRE_TIMEOUT) + log_warn("sanlock daemon started %s host %s (%s) io_timeout %d watchdog_fire_timeout %d", + VERSION, our_host_name_global, nodename.nodename, com.io_timeout, com.watchdog_fire_timeout); + else + log_warn("sanlock daemon started %s host %s (%s)", + VERSION, 
our_host_name_global, nodename.nodename); setup_priority(); @@ -2110,6 +2116,7 @@ static void print_usage(void) printf(" -G group id\n"); printf(" -t max worker threads (%d)\n", DEFAULT_MAX_WORKER_THREADS); printf(" -g seconds for graceful recovery (%d)\n", DEFAULT_GRACE_SEC); + printf(" -o io timeout (%d)\n", DEFAULT_IO_TIMEOUT); printf(" -w 0|1 use watchdog through wdmd (%d)\n", DEFAULT_USE_WATCHDOG); printf(" -h 0|1 use high priority (RR) scheduling (%d)\n", DEFAULT_HIGH_PRIORITY); printf(" -l use mlockall (0 none, 1 current, 2 current and future) (%d)\n", DEFAULT_MLOCK_LEVEL); @@ -2189,7 +2196,7 @@ static int read_command_line(int argc, char *argv[]) char *p; char *arg1 = argv[1]; char *act; - int i, j, len, sec, begin_command = 0; + int i, j, len, sec, val, begin_command = 0; if (argc < 2 || !strcmp(arg1, "help") || !strcmp(arg1, "--help") || !strcmp(arg1, "-h")) { @@ -2432,9 +2439,9 @@ static int read_command_line(int argc, char *argv[]) if (com.action == ACT_STATUS) { com.sort_arg = *optionarg; } else { - com.io_timeout_arg = atoi(optionarg); - if (!com.io_timeout_arg) - com.io_timeout_arg = DEFAULT_IO_TIMEOUT; + val = atoi(optionarg); + if (val > 0) + com.io_timeout = val; } break; case 'b': @@ -2466,8 +2473,10 @@ static int read_command_line(int argc, char *argv[]) case 'g': if (com.type == COM_DAEMON) { sec = atoi(optionarg); - if (sec <= 60 && sec >= 0) - kill_grace_seconds = sec; + if (sec <= 60 && sec >= 0) { + com.kill_grace_seconds = sec; + com.kill_grace_set = 1; + } } else { com.host_generation = strtoull(optionarg, NULL, 0); } @@ -2806,6 +2815,23 @@ static void read_config_file(void) get_val_int(line, &val); com.use_watchdog = val; + } else if (!strcmp(str, "io_timeout")) { + get_val_int(line, &val); + if (val > 0) + com.io_timeout = val; + + } else if (!strcmp(str, "watchdog_fire_timeout")) { + get_val_int(line, &val); + if (val > 0) + com.watchdog_fire_timeout = val; + + } else if (!strcmp(str, "kill_grace_seconds")) { + get_val_int(line, &val); 
+ if (val <= 60 && val >= 0) { + com.kill_grace_seconds = val; + com.kill_grace_set = 1; + } + } else if (!strcmp(str, "high_priority")) { get_val_int(line, &val); com.high_priority = val; @@ -3215,10 +3241,10 @@ static int do_client(void) break; case ACT_ADD_LOCKSPACE: - if (com.io_timeout_arg != DEFAULT_IO_TIMEOUT) { - log_tool("add_lockspace_timeout %d", com.io_timeout_arg); + if (com.io_timeout != DEFAULT_IO_TIMEOUT) { + log_tool("add_lockspace_timeout %d", com.io_timeout); rv = sanlock_add_lockspace_timeout(&com.lockspace, 0, - com.io_timeout_arg); + com.io_timeout); log_tool("add_lockspace_timeout done %d", rv); } else { log_tool("add_lockspace"); @@ -3343,7 +3369,7 @@ static int do_client(void) rv = sanlock_write_lockspace(&com.lockspace, com.max_hosts, 0, - com.io_timeout_arg); + com.io_timeout); } else { if (com.sector_size) com.res_args[0]->flags |= sanlk_res_sector_size_to_flag(com.sector_size); @@ -3595,7 +3621,7 @@ static int do_direct_read_leader(void) struct leader_record leader; int rv; - rv = direct_read_leader(&main_task, com.io_timeout_arg, + rv = direct_read_leader(&main_task, com.io_timeout, &com.lockspace, com.res_args[0], &leader); @@ -3620,7 +3646,7 @@ static int do_direct_write_leader(void) memset(&leader, 0, sizeof(leader)); - direct_read_leader(&main_task, com.io_timeout_arg, + direct_read_leader(&main_task, com.io_timeout, &com.lockspace, com.res_args[0], &leader); @@ -3643,7 +3669,7 @@ static int do_direct_write_leader(void) syslog(LOG_WARNING, "write_leader resource %s", res_str); } - rv = direct_write_leader(&main_task, com.io_timeout_arg, + rv = direct_write_leader(&main_task, com.io_timeout, &com.lockspace, com.res_args[0], &leader); out: @@ -3676,8 +3702,7 @@ static int do_direct_init(void) (unsigned long long)com.lockspace.host_id_disk.offset, com.lockspace.flags); - rv = direct_write_lockspace(&main_task, &com.lockspace, - com.io_timeout_arg); + rv = direct_write_lockspace(&main_task, &com.lockspace, com.io_timeout); } else if 
(com.res_args[0]) { if (com.sector_size) com.res_args[0]->flags |= sanlk_res_sector_size_to_flag(com.sector_size); @@ -3783,7 +3808,7 @@ static int do_direct(void) case ACT_ACQUIRE: syslog(LOG_WARNING, "acquire"); - rv = direct_acquire(&main_task, com.io_timeout_arg, + rv = direct_acquire(&main_task, com.io_timeout, com.res_args[0], com.num_hosts, com.host_id, com.host_generation, &leader); @@ -3792,7 +3817,7 @@ static int do_direct(void) case ACT_RELEASE: syslog(LOG_WARNING, "release"); - rv = direct_release(&main_task, com.io_timeout_arg, + rv = direct_release(&main_task, com.io_timeout, com.res_args[0], &leader); log_tool("release done %d", rv); break; @@ -3801,20 +3826,20 @@ static int do_direct(void) syslog(LOG_WARNING, "acquire_id"); setup_host_name(); - rv = direct_acquire_id(&main_task, com.io_timeout_arg, + rv = direct_acquire_id(&main_task, com.io_timeout, &com.lockspace, our_host_name_global); log_tool("acquire_id done %d", rv); break; case ACT_RELEASE_ID: syslog(LOG_WARNING, "release_id"); - rv = direct_release_id(&main_task, com.io_timeout_arg, &com.lockspace); + rv = direct_release_id(&main_task, com.io_timeout, &com.lockspace); log_tool("release_id done %d", rv); break; case ACT_RENEW_ID: syslog(LOG_WARNING, "renew_id"); - rv = direct_renew_id(&main_task, com.io_timeout_arg, &com.lockspace); + rv = direct_renew_id(&main_task, com.io_timeout, &com.lockspace); log_tool("rewew_id done %d", rv); break; @@ -3877,7 +3902,6 @@ int main(int argc, char *argv[]) set_sanlock_version(); kill_count_max = 100; - kill_grace_seconds = DEFAULT_GRACE_SEC; helper_ci = -1; helper_pid = -1; helper_kill_fd = -1; @@ -3890,11 +3914,13 @@ int main(int argc, char *argv[]) memset(&com, 0, sizeof(com)); com.use_watchdog = DEFAULT_USE_WATCHDOG; + com.watchdog_fire_timeout = DEFAULT_WATCHDOG_FIRE_TIMEOUT; + com.kill_grace_seconds = DEFAULT_GRACE_SEC; com.high_priority = DEFAULT_HIGH_PRIORITY; com.mlock_level = DEFAULT_MLOCK_LEVEL; com.names_log_priority = LOG_WARNING; 
com.max_worker_threads = DEFAULT_MAX_WORKER_THREADS; - com.io_timeout_arg = DEFAULT_IO_TIMEOUT; + com.io_timeout = DEFAULT_IO_TIMEOUT; com.write_init_io_timeout = DEFAULT_WRITE_INIT_IO_TIMEOUT; com.aio_arg = DEFAULT_USE_AIO; com.pid = -1; diff --git a/src/rindex.c b/src/rindex.c index 7ee4e54..9ef02da 100644 --- a/src/rindex.c +++ b/src/rindex.c @@ -64,7 +64,7 @@ static struct token *setup_rindex_token(struct rindex_info *rx, strcpy(token->r.name, "rindex_lease"); token->sector_size = sector_size; token->align_size = align_size; - token->io_timeout = spi ? spi->io_timeout : DEFAULT_IO_TIMEOUT; + token->io_timeout = spi ? spi->io_timeout : com.io_timeout; token->r.num_disks = 1; token->r.flags |= sanlk_res_sector_size_to_flag(sector_size); token->r.flags |= sanlk_res_align_size_to_flag(align_size); @@ -105,7 +105,7 @@ static struct token *setup_resource_token(struct rindex_info *rx, memcpy(token->r.name, res_name, SANLK_NAME_LEN); token->sector_size = sector_size; token->align_size = align_size; - token->io_timeout = spi ? spi->io_timeout : DEFAULT_IO_TIMEOUT; + token->io_timeout = spi ? 
spi->io_timeout : com.io_timeout; token->r.num_disks = 1; token->r.flags |= sanlk_res_sector_size_to_flag(sector_size); token->r.flags |= sanlk_res_align_size_to_flag(align_size); @@ -291,7 +291,7 @@ static int read_rindex_header(struct task *task, if (!sector_size) sector_size = 4096; if (!io_timeout) { - io_timeout = DEFAULT_IO_TIMEOUT; + io_timeout = com.io_timeout; spi->io_timeout = io_timeout; } @@ -447,7 +447,7 @@ int rindex_format(struct task *task, struct sanlk_rindex *ri) if (com.write_init_io_timeout) write_io_timeout = com.write_init_io_timeout; else - write_io_timeout = DEFAULT_IO_TIMEOUT; + write_io_timeout = com.io_timeout; rv = write_iobuf(rx.disk->fd, rx.disk->offset, iobuf, iobuf_len, task, write_io_timeout, NULL); if (rv < 0) { diff --git a/src/sanlock.8 b/src/sanlock.8 index 2021b4a..0776b8f 100644 --- a/src/sanlock.8 +++ b/src/sanlock.8 @@ -599,6 +599,9 @@ seconds for graceful recovery .BR -w " 0|1" use watchdog through wdmd +.BI -o " sec" +io timeout + .BR -h " 0|1" use high priority (RR) scheduling @@ -1071,6 +1074,34 @@ the full history of renewals saved by sanlock, which by default is 180 records, about 1 hour of history when using a 20 second renewal interval for a 10 second io timeout. +.P + +.SS Configurable watchdog timeout + +Watchdog devices usually have a 60 second timeout, but some devices +have a configurable timeout. To use a different watchdog timeout, set +sanlock.conf watchdog_fire_timeout (in seconds) to a value supported by +the device. The same watchdog_fire_timeout must be configured on all +hosts (so all hosts must have watchdog devices that support the same +timeout). Unmatching values will invalidate the lease protection provided +by the watchdog. + +watchdog_fire_timeout and io_timeout should usually be configured +together. By default, sanlock uses watchdog_fire_timeout=60 with +io_timeout=10. 
Other combinations to consider are: +.br +watchdog_fire_timeout=30 with io_timeout=5 +.br +watchdog_fire_timeout=10 with io_timeout=2 + +Smaller values make it more likely that a host will be reset by the +watchdog while waiting for slow io to complete or for temporary io +failures to be resolved. Spurious watchdog resets will also become +more likely due to independent, overlapping lockspace outages, each +of which would be inconsequential by itself. + +.P + .SH INTERNALS .SS Disk Format @@ -1377,6 +1408,23 @@ max_worker_threads = .br See -t +.IP \[bu] 2 +io_timeout = +.br +The io timeout for disk operations, most notably delta lease renewals. +This value is the basis for calculating most other timeout values. (Some +special cases may use a different io timeout.) Tune this value with +caution; it can substantially alter the overall sanlock behavior. + +.IP \[bu] 2 +watchdog_fire_timeout = +.br +The watchdog device timeout. The watchdog device must support the +specified value. It is critical that all hosts use the same value. +Not doing so will invalidate the lease protection provided by sanlock. +The io_timeout should usually be tuned along with this value, e.g. +watchdog_fire_timeout = 30 with io_timeout = 5. 
+ .SH SEE ALSO .BR wdmd (8) diff --git a/src/sanlock.conf b/src/sanlock.conf index 2909a9c..89899f1 100644 --- a/src/sanlock.conf +++ b/src/sanlock.conf @@ -69,3 +69,9 @@ # # max_worker_threads = 8 # command line: -t 8 +# +# io_timeout = 10 +# command line: -o +# +# watchdog_fire_timeout = 60 +# command line: n/a diff --git a/src/sanlock_internal.h b/src/sanlock_internal.h index 7fe0a5b..d8c027f 100644 --- a/src/sanlock_internal.h +++ b/src/sanlock_internal.h @@ -315,7 +315,7 @@ struct client { EXTERN struct client *client; -#define WATCHDOG_FIRE_TIMEOUT 60 +#define DEFAULT_WATCHDOG_FIRE_TIMEOUT 60 #define DEFAULT_USE_AIO 1 #define DEFAULT_IO_TIMEOUT 10 #define DEFAULT_GRACE_SEC 40 @@ -352,13 +352,16 @@ struct command_line { int quiet_fail; int wait; int use_watchdog; + int watchdog_fire_timeout; + int io_timeout; /* DEFAULT_IO_TIMEOUT or sanlock.conf io_timeout */ + int kill_grace_seconds; /* -g */ + int kill_grace_set; int high_priority; /* -h */ int get_hosts; /* -h */ int names_log_priority; int mlock_level; int max_worker_threads; int aio_arg; - int io_timeout_arg; int write_init_io_timeout; int set_bitmap_seconds; int persistent; @@ -455,7 +458,6 @@ EXTERN int external_shutdown; EXTERN char our_host_name_global[SANLK_NAME_LEN+1]; EXTERN int kill_count_max; -EXTERN int kill_grace_seconds; EXTERN int is_helper; EXTERN int helper_ci; EXTERN int helper_pid; diff --git a/src/task.c b/src/task.c index f3c10f8..ad2e761 100644 --- a/src/task.c +++ b/src/task.c @@ -76,7 +76,7 @@ void close_task_aio(struct task *task) goto skip_aio; memset(&ts, 0, sizeof(struct timespec)); - ts.tv_sec = DEFAULT_IO_TIMEOUT; + ts.tv_sec = com.io_timeout; last_warn = time(NULL); begin = last_warn; @@ -87,7 +87,7 @@ void close_task_aio(struct task *task) while (1) { now = time(NULL); - if (now - last_warn >= (DEFAULT_IO_TIMEOUT * 6)) { + if (now - last_warn >= (com.io_timeout * 6)) { last_warn = now; lvl = LOG_ERR; } else { diff --git a/src/timeouts.c b/src/timeouts.c index 
6d78b55..8884518 100644 --- a/src/timeouts.c +++ b/src/timeouts.c @@ -27,10 +27,50 @@ #include "task.h" #include "timeouts.h" +void setup_timeouts(void) +{ + /* + * graceful shutdown is client pids stopping their activity and + * releasing their sanlock leases in response to a killpath program + * they configured, or in response to sigterm from sanlock if they + * did not set a killpath program. It's an opportunity for the client + * pid to exit more gracefully than getting sigkill. If the client + * pid does not release leases in response to the killpath/sigterm, + * then eventually sanlock will escalate and send a sigkill. + * + * It's hard to know what portion of recovery time should be allocated + * to graceful shutdown before escalating to sigkill. The smaller the + * watchdog timeout, the less time between entering recovery mode and + * the watchdog potentially firing. 10 seconds before the watchdog + * will fire, the idea is to give up on graceful shutdown and resort + * to sending sigkill to any client pids that have not released their + * leases. This gives 10 sec for the pids to exit from sigkill, + * sanlock to get the exit statuses, clear the expiring wdmd connection, + * and hopefully have wdmd ping the watchdog again before it fires. + * A graceful shutdown period of less than 10/15 sec seems pointless, + * so if there is anything less than 10/15 sec available for a graceful + * shutdown we don't bother and go directly to sigkill (this could + * of course be changed if programs are indeed able to respond + * quickly during graceful shutdown.) + */ + if (!com.kill_grace_set && (com.watchdog_fire_timeout < DEFAULT_WATCHDOG_FIRE_TIMEOUT)) { + if (com.watchdog_fire_timeout < 60 && com.watchdog_fire_timeout >= 30) + com.kill_grace_seconds = 15; + else if (com.watchdog_fire_timeout < 30) + com.kill_grace_seconds = 0; + } +} + +/* + * Some of these timeouts depend on the io_timeout used by *another* + * host, passed as the arg, not the local io_timeout. 
+ */ + +/* All hosts are required to use the same watchdog_fire_timeout. */ int calc_host_dead_seconds(int io_timeout) { - /* id_renewal_fail_seconds + WATCHDOG_FIRE_TIMEOUT */ - return (8 * io_timeout) + WATCHDOG_FIRE_TIMEOUT; + /* id_renewal_fail_seconds + com.watchdog_fire_timeout */ + return (8 * io_timeout) + com.watchdog_fire_timeout; } int calc_id_renewal_seconds(int io_timeout) @@ -65,7 +105,7 @@ void log_timeouts(int io_timeout_arg) /* those above are chosen by us, the rest are based on them */ - int host_dead_seconds = id_renewal_fail_seconds + WATCHDOG_FIRE_TIMEOUT; + int host_dead_seconds = id_renewal_fail_seconds + com.watchdog_fire_timeout; int delta_large_delay = id_renewal_seconds + (6 * io_timeout_seconds); int delta_short_delay = 2 * io_timeout_seconds; diff --git a/src/timeouts.h b/src/timeouts.h index c6dde69..ac9842a 100644 --- a/src/timeouts.h +++ b/src/timeouts.h @@ -112,6 +112,244 @@ * that sanlock cannot successfully kill the pids it is supervising that * depend on the given host_id. * + * This analyzes the sanlock and wdmd operations every 5 seconds, and + * assumes that the sanlock and wdmd daemons are both performing their + * steps right at each 5 second mark, but in reality they will likely be + * offset from each other. 
+ * + * + * Using these values in the example + * wdmd test interval = 5 + * watchdog_fire_timeout = 30 + * io_timeout_seconds = 5 + * id_renewal_seconds = 10 + * id_renewal_fail_seconds = 40 + * host_dead_seconds = 70 + * + * wdmd_test_live(renewal_time [now], + * expire_time [now + id_renewal_fail_seconds]) + * + * T time in seconds (now) + * + * 0: sanlock renews host_id on disk + * sanlock calls wdmd_test_live(0, 40) [expire 40 from 0 + 40] + * wdmd test_client sees now 0 < expire 40 ok -> keepalive + * + * 5: wdmd test_client sees now 5 < expire 40 ok -> keepalive + * + * 10: sanlock renews host_id on disk ok + * sanlock calls wdmd_test_live(10, 50) [expire 50 from 10 + 40] + * wdmd test_client sees now 10 < expire 50 or 40 ok -> keepalive + * (50 if the wdmd check is right after this wdmd_test_live, or + * (40 if the wdmd check is right before this wdmd_test_live) + * + * 15: wdmd test_client sees now 15 < expire 50 ok -> keepalive + * + * 20: sanlock renews host_id on disk ok + * sanlock calls wdmd_test_live(20, 60) [expire 60 from 20 + 40] + * wdmd test_client sees now 20 < expire 60 or 50 ok -> keepalive + * + * 25: wdmd test_client sees now 25 < expire 60 ok -> keepalive + * + * all normal until 29 + * --------------------------------------------------------- + * problems begin at 30 + * + * 30: sanlock fails to renew host_id on disk -> no wdmd_test_live + * wdmd test_client sees now 30 < expire 60 ok -> keepalive + * + * 35: sanlock fails to renew host_id on disk -> no wdmd_test_live + * wdmd test_client sees now 35 < expire 60 ok -> keepalive + * + * 40: sanlock fails to renew host_id on disk -> no wdmd_test_live + * wdmd test_client sees now 40 < expire 60 ok -> keepalive + * + * 45: sanlock fails to renew host_id on disk -> no wdmd_test_live + * wdmd test_client sees now 45 < expire 60 ok -> keepalive + * + * 50: sanlock fails to renew host_id on disk -> no wdmd_test_live + * wdmd test_client sees now 50 < expire 60 ok -> keepalive + * messages: 
check_our_lease warning (sanlock) + * + * 55: sanlock fails to renew host_id on disk -> no wdmd_test_live + * wdmd test_client sees now 55 < expire 60 ok -> keepalive (from dev close) + * (wdmd sees now >= expire - test_interval) + * messages: watchdog closed unclean (wdmd), test warning (wdmd) + * + * 60: sanlock fails to renew host_id on disk -> no wdmd_test_live + * sanlock enters recovery mode and starts killing pids because we have reached + * now (60) is id_renewal_fail_seconds (40) after last renewal (20) + * wdmd test_client sees now 60 >= expire 60 fail -> no keepalive + * messages: check_our_lease failed (sanlock), test failed (wdmd) + * + * . /dev/watchdog will fire at last keepalive + watchdog_fire_timeout = + * T55 + 30 = T85 + * . host_id will expire at + * last disk renewal ok + id_renewal_fail_seconds + watchdog_fire_timeout + * T20 + 40 + 30 = T90 + * (aka last disk renewal ok + host_dead_seconds, T20 + 70 = T90) + * . the wdmd test at T55 could have been at T59, so wdmd would have + * seen the client unexpired/ok and done keepalive at 59 just before the + * expiry at 60, which would lead to /dev/watchdog firing at 59+30 = T89 + * . 
so, the watchdog could fire as early as T85 or as late as T89, but + * the host_id will not expire until T90 + * + * 65: sanlock fails to renew host_id on disk -> no wdmd_test_live + * wdmd test_client sees now 65 > expire 60 fail -> no keepalive + * + * 70: sanlock fails to renew host_id on disk -> no wdmd_test_live + * wdmd test_client sees now 70 > expire 60 fail -> no keepalive + * + * 75: sanlock fails to renew host_id on disk -> no wdmd_test_live + * wdmd test_client sees now 75 > expire 60 fail -> no keepalive + * + * 80: sanlock fails to renew host_id on disk -> no wdmd_test_live + * wdmd test_client sees now 80 > expire 60 fail -> no keepalive + * + * 85: sanlock fails to renew host_id on disk -> no wdmd_test_live + * wdmd test_client sees now 85 > expire 60 fail -> no keepalive + * /dev/watchdog fires because last keepalive was T55, 30 seconds ago + * (earliest possible /dev/watchdog firing due to wdmd checking expiry just + * after sanlock calls wdmd_test_live at T55 and just after the expiry at T60) + * + * 89: (latest possible /dev/watchdog firing due to wdmd checking expiry just + * before the expiry at T59) + * + * 90: another host can acquire leases held by host_id. + * This is host_dead_seconds (70) after the last successful renewal (T20) + * -- + * + * entering recovery mode at 60 until watchdog firing at 85 is 25 seconds + * to hopefully clear client usage of leases and avert the watchdog firing. + * kill_grace_seconds is 15, leaving clients 15 seconds to do a graceful + * shutdown using their killpath or respond to sigterm. If the client + * hasn't dropped its leases in these 15 seconds, sanlock escalates to + * using sigkill with 10 seconds remaining until the watchdog fires. + * 10 seconds is hopefully long enough for client pids to exit, sanlock + * to collect the exit status and clear the wdmd connection before the + * watchdog really fires. 
+ */ + +/* + * Example of watchdog behavior when host_id renewals fail, assuming + * that sanlock cannot successfully kill the pids it is supervising that + * depend on the given host_id. + * + * This analyzes the sanlock and wdmd operations every 2 seconds, and + * assumes that the sanlock and wdmd daemons are both performing their + * steps right at each 2 second mark, but in reality they will likely be + * offset from each other. + * + * Using these values in the example + * wdmd test interval = 2 + * watchdog_fire_timeout = 10 + * io_timeout_seconds = 2 + * id_renewal_seconds = 4 + * id_renewal_fail_seconds = 16 + * host_dead_seconds = 26 + * + * wdmd_test_live(renewal_time [now], + * expire_time [now + id_renewal_fail_seconds]) + * + * T time in seconds (now) + * + * 0: sanlock renews host_id on disk + * sanlock calls wdmd_test_live(0, 16) [expire 16 from 0 + 16] + * wdmd test_client sees now 0 < expire 16 ok -> keepalive + * + * 2: wdmd test_client sees now 2 < expire 16 ok -> keepalive + * + * 4: sanlock renews host_id on disk ok + * sanlock calls wdmd_test_live(4, 20) [expire 20 from 4 + 16] + * wdmd test_client sees now 4 < expire 16 or 20 ok -> keepalive + * + * 6: wdmd test_client sees now 6 < expire 20 ok -> keepalive + * + * 8: sanlock renews host_id on disk ok + * sanlock calls wdmd_test_live(8, 24) [expire 24 from 8 + 16] + * wdmd test_client sees now 8 < expire 20 or 24 ok -> keepalive + * + * 10: wdmd test_client sees now 10 < expire 24 ok -> keepalive + * + * all normal until 11 + * --------------------------------------------------------- + * problems begin at 12 + * + * 12: sanlock fails to renew host_id on disk -> no wdmd_test_live + * wdmd test_client sees now 12 < expire 24 ok -> keepalive + * + * 14: sanlock fails to renew host_id on disk -> no wdmd_test_live + * wdmd test_client sees now 14 < expire 24 ok -> keepalive + * + * 16: sanlock fails to renew host_id on disk -> no wdmd_test_live + * wdmd test_client sees now 16 < expire 24 ok -> 
keepalive + * + * 18: sanlock fails to renew host_id on disk -> no wdmd_test_live + * wdmd test_client sees now 18 < expire 24 ok -> keepalive + * + * 20: sanlock fails to renew host_id on disk -> no wdmd_test_live + * wdmd test_client sees now 20 < expire 24 ok -> keepalive + * messages: check_our_lease warning (sanlock) + * (these warnings appear 6*io_timeout after last renewal) + * + * 22: sanlock fails to renew host_id on disk -> no wdmd_test_live + * wdmd test_client sees now 22 < expire 24 ok -> keepalive (from dev close) + * (wdmd sees now >= expire - test_interval) + * messages: watchdog closed unclean (wdmd), test warning (wdmd) + * + * 24: sanlock fails to renew host_id on disk -> no wdmd_test_live + * sanlock enters recovery mode and starts killing pids because we have reached + * now (24) is id_renewal_fail_seconds (16) after last renewal (8) + * wdmd test_client sees now 24 >= expire 24 fail -> no keepalive + * messages: check_our_lease failed (sanlock), test failed (wdmd) + * + * . /dev/watchdog will fire at last keepalive + watchdog_fire_timeout = + * T22 + 10 = T32 + * . host_id will expire at + * last disk renewal ok + id_renewal_fail_seconds + watchdog_fire_timeout + * T8 + 16 + 10 = T34 + * (aka last disk renewal ok + host_dead_seconds, T8 + 26 = T34) + * . the wdmd test at T22 could have been at T23, so wdmd would have + * seen the client unexpired/ok and done keepalive at 23 just before the + * expiry at 24, which would lead to /dev/watchdog firing at 23+10 = T33 + * . 
so, the watchdog could fire as early as T32 or as late as T33, but + * the host_id will not expire until T34 + * + * 26: sanlock fails to renew host_id on disk -> no wdmd_test_live + * wdmd test_client sees now 26 > expire 24 fail -> no keepalive + * + * 28: sanlock fails to renew host_id on disk -> no wdmd_test_live + * wdmd test_client sees now 28 > expire 24 fail -> no keepalive + * + * 30: sanlock fails to renew host_id on disk -> no wdmd_test_live + * wdmd test_client sees now 30 > expire 24 fail -> no keepalive + * + * 32: sanlock fails to renew host_id on disk -> no wdmd_test_live + * wdmd test_client sees now 32 > expire 24 fail -> no keepalive + * /dev/watchdog fires because last keepalive was T22, 10 seconds ago + * (earliest possible /dev/watchdog firing due to wdmd checking expiry just + * after sanlock calls wdmd_test_live at T22 and just after the expiry at T24) + * + * 33: (latest possible /dev/watchdog firing due to wdmd checking expiry just + * before the expiry at T23) + * + * 34: another host can acquire leases held by host_id. + * This is host_dead_seconds (26) after the last successful renewal (T8) + * + * -- + * + * entering recovery mode at 24 until watchdog firing at 32 is only 8 seconds, + * so there is no time for graceful recovery, so kill_grace_seconds would be + * set to 0 here. All 8 seconds would be used to hopefully complete sigkill, + * collect client exit statuses, and clear the expiring wdmd connection + * before the watchdog actually fires. + */ + +/* + * Example of watchdog behavior when host_id renewals fail, assuming + * that sanlock cannot successfully kill the pids it is supervising that + * depend on the given host_id. 
+ * * * Using these values in the example * wdmd test interval = 10 (defined in wdmd/main.c) @@ -213,6 +451,17 @@ * * 300: another host can acquire leases held by host_id * This is host_dead_seconds (220) after last successful renewal (T80) + * -- + * + * entering recovery mode at 240 until watchdog firing at 290 is 50 seconds + * to hopefully clear client usage of leases and avert the watchdog firing. + * kill_grace_seconds is 40, leaving clients 40 seconds to do a graceful + * shutdown using their killpath or respond to sigterm. If the client + * hasn't dropped its leases in these 40 seconds, sanlock escalates to + * using sigkill with 10 seconds remaining until the watchdog fires. + * 10 seconds is hopefully long enough for client pids to exit, sanlock + * to collect the exit status and clear the wdmd connection before the + * watchdog really fires. */ @@ -448,6 +697,7 @@ #ifndef __TIMEOUTS_H__ #define __TIMEOUTS_H__ +void setup_timeouts(void); int calc_host_dead_seconds(int io_timeout); int calc_id_renewal_seconds(int io_timeout); int calc_id_renewal_fail_seconds(int io_timeout); diff --git a/src/watchdog.c b/src/watchdog.c index 2c6c5b8..d0077dd 100644 --- a/src/watchdog.c +++ b/src/watchdog.c @@ -39,6 +39,24 @@ #include "../wdmd/wdmd.h" +/* tell wdmd to open the watchdog device, set the fire timeout and begin keepalives */ +int open_watchdog(int con, int fire_timeout) +{ + int rv; + + if (!com.use_watchdog) + return 0; + + rv = wdmd_open_watchdog(con, fire_timeout); + if (rv < 0) { + log_error("wdmd_open_watchdog fire_timeout %d error", fire_timeout); + return -1; + } + + return 0; +} + +/* tell wdmd that this connection is still good and watchdog pings can continue for it */ void update_watchdog(struct space *sp, uint64_t timestamp, int id_renewal_fail_seconds) { @@ -53,6 +71,7 @@ void update_watchdog(struct space *sp, uint64_t timestamp, (unsigned long long)timestamp, rv); } +/* connects to the wdmd daemon */ int connect_watchdog(struct space *sp) { int 
con; @@ -69,6 +88,7 @@ int connect_watchdog(struct space *sp) return con; } +/* associate wdmd keepalives to the continued liveness of this lockspace */ int activate_watchdog(struct space *sp, uint64_t timestamp, int id_renewal_fail_seconds, int con) { @@ -105,9 +125,9 @@ int activate_watchdog(struct space *sp, uint64_t timestamp, goto fail_clear; } - if (fire_timeout != WATCHDOG_FIRE_TIMEOUT) { + if (fire_timeout != com.watchdog_fire_timeout) { log_erros(sp, "wdmd invalid fire_timeout %d vs %d", - fire_timeout, WATCHDOG_FIRE_TIMEOUT); + fire_timeout, com.watchdog_fire_timeout); goto fail_clear; } @@ -153,7 +173,7 @@ void deactivate_watchdog(struct space *sp) wdmd_refcount_clear(sp->wd_fd); } -void close_watchdog(struct space *sp) +void disconnect_watchdog(struct space *sp) { if (!com.use_watchdog) return; diff --git a/src/watchdog.h b/src/watchdog.h index a462559..b872757 100644 --- a/src/watchdog.h +++ b/src/watchdog.h @@ -9,12 +9,20 @@ #ifndef __WATCHDOG_H__ #define __WATCHDOG_H__ -void update_watchdog(struct space *sp, uint64_t timestamp, - int id_renewal_fail_seconds); +/* open/close socket connection to wdmd daemon */ int connect_watchdog(struct space *sp); +void disconnect_watchdog(struct space *sp); + +/* tell wdmd to open the watchdog device which arms it + and wdmd begins keepalive loop, but the watchdog + keepalive is not yet influenced by lockspace renewals. 
*/ +int open_watchdog(int con, int fire_timeout); + +/* associate per-lockspace renewals in sanlock with + watchdog petting in wdmd */ int activate_watchdog(struct space *sp, uint64_t timestamp, int id_renewal_fail_seconds, int con); void deactivate_watchdog(struct space *sp); -void close_watchdog(struct space *sp); - +void update_watchdog(struct space *sp, uint64_t timestamp, + int id_renewal_fail_seconds); #endif diff --git a/wdmd/client.c b/wdmd/client.c index 87fcd40..be5278a 100644 --- a/wdmd/client.c +++ b/wdmd/client.c @@ -77,6 +77,29 @@ static int send_header(int con, int cmd) return 0; } +int wdmd_open_watchdog(int con, int fire_timeout) +{ + struct wdmd_header h; + int rv; + + memset(&h, 0, sizeof(h)); + h.cmd = CMD_OPEN_WATCHDOG; + h.fire_timeout = fire_timeout; + + rv = send(con, (void *)&h, sizeof(struct wdmd_header), 0); + if (rv < 0) + return -errno; + + memset(&h, 0, sizeof(h)); + rv = recv(con, &h, sizeof(h), MSG_WAITALL); + if (rv < 0) + return -errno; + + if (h.fire_timeout != fire_timeout) + return -1; + return 0; +} + int wdmd_refcount_set(int con) { return send_header(con, CMD_REFCOUNT_SET); diff --git a/wdmd/main.c b/wdmd/main.c index aebacbe..e89ab5e 100644 --- a/wdmd/main.c +++ b/wdmd/main.c @@ -56,16 +56,20 @@ #define WDPATH_SIZE 64 -static int test_interval = DEFAULT_TEST_INTERVAL; +static int standard_test_interval = DEFAULT_TEST_INTERVAL; +static int test_interval= DEFAULT_TEST_INTERVAL; static int fire_timeout = DEFAULT_FIRE_TIMEOUT; static int high_priority = DEFAULT_HIGH_PRIORITY; static int daemon_quit; static int daemon_debug; +static int try_timeout; +static int forcefire; static int socket_gid; static char *socket_gname = (char *)SOCKET_GNAME; static time_t last_keepalive; static time_t last_closeunclean; static char lockfile_path[PATH_MAX]; +static int test_loop_enable; static int dev_fd = -1; static int shm_fd; @@ -334,6 +338,155 @@ static void dump_debug(int fd) send(fd, debug_buf, debug_len, MSG_NOSIGNAL); } +static void 
_init_test_interval(void) +{ + if (fire_timeout >= 60) { + standard_test_interval = 10; + test_interval = 10; + } else if (fire_timeout >= 30 && fire_timeout < 60) { + standard_test_interval = 5; + test_interval = 5; + } else if (fire_timeout >= 10 && fire_timeout < 30) { + standard_test_interval = 2; + test_interval = 2; + } else { + standard_test_interval = 1; + test_interval = 1; + } +} + +static int open_dev(void) +{ + int fd; + + if (dev_fd != -1) { + log_error("watchdog already open fd %d", dev_fd); + return -1; + } + + fd = open(watchdog_path, O_WRONLY | O_CLOEXEC); + if (fd < 0) { + log_error("open %s error %d", watchdog_path, errno); + return fd; + } + + dev_fd = fd; + return 0; +} + +static void close_watchdog(void) +{ + int rv; + + if (dev_fd == -1) { + log_debug("close_watchdog already closed"); + return; + } + + rv = write(dev_fd, "V", 1); + if (rv < 0) + log_error("%s disarm write error %d", watchdog_path, errno); + else + log_error("%s disarmed", watchdog_path); + + close(dev_fd); + dev_fd = -1; +} + +static void close_watchdog_unclean(void) +{ + if (dev_fd == -1) + return; + + log_error("%s closed unclean", watchdog_path); + close(dev_fd); + dev_fd = -1; + + last_closeunclean = monotime(); +} + +static void pet_watchdog(void) +{ + int rv, unused; + + rv = ioctl(dev_fd, WDIOC_KEEPALIVE, &unused); + + last_keepalive = monotime(); + log_debug("keepalive %d", rv); +} + +static int _open_watchdog(struct wdmd_header *h) +{ + int get_timeout, set_timeout; + int rv; + + /* Don't check dev_fd for -1 because dev_fd will be closed + and set to -1 prior to timeout in close_watchdog_unclean(). 
*/ + + if (test_loop_enable) + return 0; + + if (!h->fire_timeout) + return -1; + + rv = open_dev(); + if (rv < 0) + return -1; + + get_timeout = 0; + + rv = ioctl(dev_fd, WDIOC_GETTIMEOUT, &get_timeout); + if (rv < 0) { + log_error("open_watchdog gettimeout error %d", errno); + close_watchdog(); + return -1; + } + + if (get_timeout == h->fire_timeout) { + /* success, requested value matches the default value */ + fire_timeout = get_timeout; + _init_test_interval(); + log_error("%s open with timeout %d", watchdog_path, get_timeout); + pet_watchdog(); + test_loop_enable = 1; + return 0; + } + + set_timeout = h->fire_timeout; + + rv = ioctl(dev_fd, WDIOC_SETTIMEOUT, &set_timeout); + if (rv < 0) { + log_error("open_watchdog settimeout %d error %d", set_timeout, errno); + close_watchdog(); + return -1; + } + + get_timeout = 0; + + rv = ioctl(dev_fd, WDIOC_GETTIMEOUT, &get_timeout); + if (rv < 0) { + log_error("open_watchdog gettimeout check error %d", errno); + close_watchdog(); + return -1; + } + + if (get_timeout == set_timeout) { + /* success setting a custom timeout */ + fire_timeout = get_timeout; + _init_test_interval(); + log_error("%s open with timeout %d", watchdog_path, get_timeout); + pet_watchdog(); + test_loop_enable = 1; + return 0; + } + + /* failed to set a custom timeout */ + log_error("open_watchdog gettimeout value %d set %d", + get_timeout, set_timeout); + close_watchdog(); + return -1; +} + static void process_connection(int ci) { struct wdmd_header h; @@ -378,6 +531,17 @@ static void process_connection(int ci) client[ci].refcount = 0; break; + case CMD_OPEN_WATCHDOG: + memcpy(&h_ret, &h, sizeof(h)); + rv = _open_watchdog(&h); + if (rv < 0) + h_ret.fire_timeout = 0; + else + h_ret.fire_timeout = fire_timeout; + log_debug("open_watchdog fire_timeout %u result %u", h.fire_timeout, fire_timeout); + send(client[ci].fd, &h_ret, sizeof(h_ret), MSG_NOSIGNAL); + break; + case CMD_TEST_LIVE: client[ci].renewal = h.renewal_time; client[ci].expire = 
h.expire_time; @@ -509,7 +673,7 @@ static int test_clients(void) if (t >= client[i].expire) { log_error("test failed rem %d now %llu ping %llu close %llu renewal %llu expire %llu client %d %s", - DEFAULT_FIRE_TIMEOUT - (int)(t - last_ping), + fire_timeout - (int)(t - last_ping), (unsigned long long)t, (unsigned long long)last_keepalive, (unsigned long long)last_closeunclean, @@ -540,7 +704,7 @@ static int test_clients(void) * expiration time. */ - if (t >= client[i].expire - DEFAULT_TEST_INTERVAL) { + if (t >= client[i].expire - standard_test_interval) { log_error("test warning now %llu ping %llu close %llu renewal %llu expire %llu client %d %s", (unsigned long long)t, (unsigned long long)last_keepalive, @@ -791,7 +955,7 @@ static int test_scripts(void) */ if (!scripts[i].last_result && - ((begin - scripts[i].start) < (DEFAULT_TEST_INTERVAL - 1))) + ((begin - scripts[i].start) < (standard_test_interval - 1))) continue; pid = run_script(i); @@ -806,7 +970,7 @@ static int test_scripts(void) } } - /* wait up to DEFAULT_TEST_INTERVAL-1 for the pids to finish */ + /* wait up to standard_test_interval-1 for the pids to finish */ while (1) { running = 0; @@ -893,7 +1057,7 @@ static int test_scripts(void) if (!running) break; - if (monotime() - begin >= DEFAULT_TEST_INTERVAL - 1) + if (monotime() - begin >= standard_test_interval - 1) break; sleep(1); @@ -927,58 +1091,6 @@ static int test_scripts(void) return fail_count; } -static int open_dev(void) -{ - int fd; - - if (dev_fd != -1) { - log_error("watchdog already open fd %d", dev_fd); - return -1; - } - - fd = open(watchdog_path, O_WRONLY | O_CLOEXEC); - if (fd < 0) { - log_error("open %s error %d", watchdog_path, errno); - return fd; - } - - dev_fd = fd; - return 0; -} - -static void close_watchdog_unclean(void) -{ - if (dev_fd == -1) { - log_debug("close_watchdog_unclean already closed"); - return; - } - - log_error("%s closed unclean", watchdog_path); - close(dev_fd); - dev_fd = -1; - - last_closeunclean = monotime(); 
-} - -static void close_watchdog(void) -{ - int rv; - - if (dev_fd == -1) { - log_error("close_watchdog already closed"); - return; - } - - rv = write(dev_fd, "V", 1); - if (rv < 0) - log_error("%s disarm write error %d", watchdog_path, errno); - else - log_error("%s disarmed", watchdog_path); - - close(dev_fd); - dev_fd = -1; -} - static int _setup_watchdog(char *path) { struct stat buf; @@ -1004,25 +1116,9 @@ static int _setup_watchdog(char *path) return -1; } - if (timeout == fire_timeout) - goto out; - - timeout = fire_timeout; - - rv = ioctl(dev_fd, WDIOC_SETTIMEOUT, &timeout); - if (rv < 0) { - log_error("%s failed to set timeout", watchdog_path); - close_watchdog(); - return -1; - } + log_debug("%s gettimeout reported %u", watchdog_path, timeout); - if (timeout != fire_timeout) { - log_error("%s failed to set new timeout", watchdog_path); - close_watchdog(); - return -1; - } - out: - log_error("%s armed with fire_timeout %d", watchdog_path, fire_timeout); + close_watchdog(); /* TODO: save watchdog_path in /run/wdmd/saved_path, * and in startup read that file, copying it to saved_path */ @@ -1091,7 +1187,103 @@ static int setup_watchdog(void) } -static int probe_dev(const char *path) +static int _try_timeout(const char *path) +{ + struct stat buf; + int get_timeout, set_timeout; + int unused, fd, err, rv; + + rv = stat(path, &buf); + if (rv < 0) { + fprintf(stderr, "%s stat error %d\n", path, errno); + return -1; + } + + fd = open(path, O_WRONLY | O_CLOEXEC); + if (fd < 0) { + fprintf(stderr, "%s open error %d\n", path, errno); + return fd; + } + + get_timeout = 0; + + rv = ioctl(fd, WDIOC_GETTIMEOUT, &get_timeout); + if (rv < 0) { + fprintf(stderr, "%s gettimeout error %d\n", path, errno); + rv = -1; + goto out; + } + + printf("%s gettimeout %d\n", path, get_timeout); + + set_timeout = try_timeout; + + rv = ioctl(fd, WDIOC_SETTIMEOUT, &set_timeout); + if (rv < 0) { + fprintf(stderr, "%s settimeout %d error %d\n", path, set_timeout, errno); + rv = -1; + goto 
out; + } + + printf("%s settimeout %d result %d\n", path, try_timeout, set_timeout); + + if (set_timeout != try_timeout) { + fprintf(stderr, "%s settimeout %d failed\n", path, try_timeout); + rv = -1; + goto out; + } + + get_timeout = 0; + + rv = ioctl(fd, WDIOC_GETTIMEOUT, &get_timeout); + if (rv < 0) { + fprintf(stderr, "%s gettimeout error %d\n", path, errno); + rv = -1; + goto out; + } + + printf("%s gettimeout %d\n", path, get_timeout); + + rv = ioctl(fd, WDIOC_KEEPALIVE, &unused); + if (rv < 0) { + fprintf(stderr, "%s keepalive error %d\n", path, errno); + rv = -1; + goto out; + } + + if (forcefire) { + int sleep_sec = 0; + int i; + setbuf(stdout, NULL); + printf("waiting for watchdog to reset machine:\n"); + for (i = 1; i < get_timeout + 5; i++) { + sleep(1); + sleep_sec++; + if (sleep_sec == get_timeout+1) { + printf("\n"); + printf("%d %s failed to fire after timeout %d seconds\n", i, path, get_timeout); + } else if (sleep_sec > get_timeout+1) { + printf("%d %s failed to fire after timeout %d seconds\n", i, path, get_timeout); + } else { + printf("%d ", i); + } + } + } + + rv = 0; + out: + err = write(fd, "V", 1); + if (err < 0) { + fprintf(stderr, "trytimeout failed to disarm %s error %d %d\n", path, err, errno); + openlog("wdmd", LOG_CONS | LOG_PID, LOG_DAEMON); + syslog(LOG_ERR, "trytimeout failed to disarm %s error %d %d\n", path, err, errno); + } + + close(fd); + return rv; +} + +static int _probe_dev(const char *path) { struct stat buf; int fd, err, rv, timeout; @@ -1153,6 +1345,14 @@ static int probe_dev(const char *path) return rv; } +static int probe_dev(const char *path) +{ + if (try_timeout) + return _try_timeout(path); + else + return _probe_dev(path); +} + static int probe_watchdog(void) { int rv; @@ -1205,16 +1405,6 @@ static int probe_watchdog(void) } -static void pet_watchdog(void) -{ - int rv, unused; - - rv = ioctl(dev_fd, WDIOC_KEEPALIVE, &unused); - - last_keepalive = monotime(); - log_debug("keepalive %d", rv); -} - static void 
process_signals(int ci) { struct signalfd_siginfo fdsi; @@ -1305,8 +1495,6 @@ static int test_loop(void) int fail_count; int rv, i; - pet_watchdog(); - test_time = 0; poll_timeout = test_interval * 1000; @@ -1335,6 +1523,12 @@ static int test_loop(void) if (daemon_quit && !active_clients()) break; + /* + * No client has called open_watchdog() so the wd device is not open yet. + */ + if (!test_loop_enable) + continue; + if (monotime() - test_time >= test_interval) { test_time = monotime(); log_debug("test_time %llu", @@ -1354,7 +1548,7 @@ static int test_loop(void) pet_watchdog(); } - test_interval = DEFAULT_TEST_INTERVAL; + test_interval = standard_test_interval; } else { /* If we can patch the kernel so that close does not generate a ping, then we can skip @@ -1513,19 +1707,21 @@ static void print_usage_and_exit(int status) { printf("Usage:\n"); printf("wdmd [options]\n\n"); - printf("--version, -V print version\n"); - printf("--help, -h print usage\n"); - printf("--dump, -d print debug from daemon\n"); - printf("--probe, -p print path of functional watchdog device\n"); - printf("-D debug: no fork and print all logging to stderr\n"); - printf("-H 0|1 use high priority features (1 yes, 0 no, default %d)\n", - DEFAULT_HIGH_PRIORITY); - printf("-G group ownership for the socket\n"); - printf("-S 0|1 allow script tests (default %d)\n", allow_scripts); - printf("-s path to scripts dir (default %s)\n", scripts_dir); - printf("-k kill unfinished scripts after num seconds (default %d)\n", - kill_script_sec); - printf("-w /dev/watchdog path to the watchdog device to try first\n"); + printf("--version, -V print version\n"); + printf("--help, -h print usage\n"); + printf("--dump, -d print debug from daemon\n"); + printf("--probe, -p print path of functional watchdog device\n"); + printf("--trytimeout, -t set the timeout value for watchdog device\n"); + printf("--forcefire, -F force watchdog to fire and reset machine, use with -t\n"); + printf("-D debug: no fork and print all 
logging to stderr\n"); + printf("-H 0|1 use high priority features (1 yes, 0 no, default %d)\n", + DEFAULT_HIGH_PRIORITY); + printf("-G group ownership for the socket\n"); + printf("-S 0|1 allow script tests (default %d)\n", allow_scripts); + printf("-s path to scripts dir (default %s)\n", scripts_dir); + printf("-k kill unfinished scripts after num seconds (default %d)\n", + kill_script_sec); + printf("-w path to the watchdog device to try first\n"); exit(status); } @@ -1553,14 +1749,16 @@ int main(int argc, char *argv[]) int option_index = 0; static struct option long_options[] = { - {"help", no_argument, 0, 'h' }, - {"probe", no_argument, 0, 'p' }, - {"dump", no_argument, 0, 'd' }, - {"version", no_argument, 0, 'V' }, - {0, 0, 0, 0 } + {"help", no_argument, 0, 'h' }, + {"probe", no_argument, 0, 'p' }, + {"dump", no_argument, 0, 'd' }, + {"trytimeout", required_argument, 0, 't' }, + {"forcefire", no_argument, 0, 'F' }, + {"version", no_argument, 0, 'V' }, + {0, 0, 0, 0 } }; - c = getopt_long(argc, argv, "hpdVDH:G:S:s:k:w:", + c = getopt_long(argc, argv, "hpdVDH:G:S:s:k:w:t:F", long_options, &option_index); if (c == -1) break; @@ -1572,6 +1770,13 @@ int main(int argc, char *argv[]) case 'p': do_probe = 1; break; + case 't': + do_probe = 1; + try_timeout = atoi(optarg); + break; + case 'F': + forcefire = 1; + break; case 'd': print_debug_and_exit(); break; diff --git a/wdmd/wdmd.8 b/wdmd/wdmd.8 index cc03be7..6cbb813 100644 --- a/wdmd/wdmd.8 +++ b/wdmd/wdmd.8 @@ -10,9 +10,9 @@ wdmd \- watchdog multiplexing daemon .SH DESCRIPTION This daemon opens /dev/watchdog and allows multiple independent sources to -detmermine whether each KEEPALIVE is done. Every test interval (10 +determine whether each KEEPALIVE is done. Every test interval (default 10 seconds), the daemon tests each source. If any test fails, the KEEPALIVE -is not done. In a standard configuration, the watchdog timer will reset +is not done.
In the default configuration, the watchdog timer will reset the system if no KEEPALIVE is done for 60 seconds ("fire timeout"). This means that if a single test fails 5-6 times in row, the watchdog will fire and reset the system. With multiple test sources, fewer separate failures @@ -40,8 +40,8 @@ T60: watchdog fires, system resets T60, and the tests at T60 would not be run.) A crucial aspect to the design and function of wdmd is that if any single -source does not pass tests for the fire timeout, the watchdog is -guaranteed to fire, regardless of whether other sources on the system have +source does not pass the test for the length of the fire timeout, the watchdog +is guaranteed to fire, regardless of whether other sources on the system have passed or failed. A spurious reset due to the combined effects of multiple failing tests as shown above, is an accepted side effect. @@ -113,4 +113,13 @@ it is considered a failure. .BI \-w " path" The path to the watchdog device to try first. +.TP +.BI "\-\-trytimeout, \-t" " seconds" + Set the timeout for the watchdog device. Use this to check for supported + timeout values. + +.TP +.B \-\-forcefire, \-F + Force the watchdog to fire and reset the machine. Use with -t. + diff --git a/wdmd/wdmd.h b/wdmd/wdmd.h index b32598f..01725a9 100644 --- a/wdmd/wdmd.h +++ b/wdmd/wdmd.h @@ -13,6 +13,7 @@ #define WDMD_NAME_SIZE 128 int wdmd_connect(void); +int wdmd_open_watchdog(int con, int fire_timeout); int wdmd_register(int con, char *name); int wdmd_refcount_set(int con); int wdmd_refcount_clear(int con); diff --git a/wdmd/wdmd_sock.h b/wdmd/wdmd_sock.h index f5ed27e..b554631 100644 --- a/wdmd/wdmd_sock.h +++ b/wdmd/wdmd_sock.h @@ -20,6 +20,7 @@ enum { CMD_TEST_LIVE, CMD_STATUS, CMD_DUMP_DEBUG, + CMD_OPEN_WATCHDOG, }; struct wdmd_header {