From 179755a345a6dbca0875e91904adb92664669e9d Mon Sep 17 00:00:00 2001
From: David Teigland
Date: Oct 08 2012 16:27:02 +0000
Subject: fence_sanlock: new code

A fence agent that uses /dev/watchdog to reset hosts. Per-host sanlock
leases on shared storage are used:

- for hosts to detect that they are being fenced by someone if they
  haven't actually failed

- for hosts to verify that fenced hosts have been reset, by seeing their
  host_id leases expire (not be renewed on disk)

- for hosts to know how long to wait after fencing before they can
  proceed, based on acquiring the expired lease of the fenced host

Signed-off-by: David Teigland
---

diff --git a/VERSION b/VERSION
new file mode 100644
index 0000000..6b4950e
--- /dev/null
+++ b/VERSION
@@ -0,0 +1 @@
2.4

diff --git a/fence_sanlock/Makefile b/fence_sanlock/Makefile
new file mode 100644
index 0000000..017f1a4
--- /dev/null
+++ b/fence_sanlock/Makefile
@@ -0,0 +1,44 @@
TARGET1 = fence_sanlockd
TARGET2 = fence_sanlock

SOURCE1 = fence_sanlockd.c
SOURCE2 = fence_sanlock.in

CFLAGS += -D_GNU_SOURCE -g \
	-Wall \
	-Wformat \
	-Wformat-security \
	-Wnested-externs \
	-Wpointer-arith \
	-Wextra -Wshadow \
	-Wcast-align \
	-Wwrite-strings \
	-Waggregate-return \
	-Wstrict-prototypes \
	-Winline \
	-Wredundant-decls \
	-Wno-sign-compare \
	-Wp,-D_FORTIFY_SOURCE=2 \
	-fexceptions \
	-fasynchronous-unwind-tables \
	-fdiagnostics-show-option

VER=$(shell cat ../VERSION)
CFLAGS += -DVERSION=\"$(VER)\"

LDFLAGS = -lrt -laio -lblkid -lsanlock -lwdmd

all: $(TARGET1) $(TARGET2)

$(TARGET1): $(SOURCE1)
	$(CC) $(CFLAGS) $(SOURCE1) -o $@ -L. -L../src $(LDFLAGS)

$(TARGET2): $(SOURCE2)
	sed -e 's#@VERSION@#${VER}#g' $(SOURCE2) > $(TARGET2)
	chmod 755 $(TARGET2)

clean:
	rm -f *.o *.so *.so.* $(TARGET1) $(TARGET2)

diff --git a/fence_sanlock/fence_sanlock.in b/fence_sanlock/fence_sanlock.in
new file mode 100755
index 0000000..7ab8c39
--- /dev/null
+++ b/fence_sanlock/fence_sanlock.in
@@ -0,0 +1,269 @@
#!/bin/bash

# Copyright 2012 Red Hat, Inc.
#
# This copyrighted material is made available to anyone wishing to use,
# modify, copy, or redistribute it subject to the terms and conditions
# of the GNU General Public License v2 or (at your option) any later version.
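
# Overview of the mechanism (summarized from the commit message and the
# comments in fence_sanlockd.c): fencing a host means acquiring its
# per-host lease on the shared lease device.  The victim's sanlock daemon
# sees the lease request and signals fence_sanlockd with SIGUSR1, which
# then stops petting wdmd, so the victim's watchdog fires and resets it.
# Once the victim stops renewing its host_id lease, the fencing host can
# acquire the victim's lease, and only then does the "off" action return
# success.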

# cluster.conf
#
# illustrative sketch (the original XML example was garbled; node names
# and the device path here are placeholders):
#
# <clusternodes>
# <clusternode name="node1" nodeid="1">
#         <fence>
#         <method name="1">
#         <device name="wd" host_id="1"/>
#         </method>
#         </fence>
# </clusternode>
# <clusternode name="node2" nodeid="2">
#         <fence>
#         <method name="1">
#         <device name="wd" host_id="2"/>
#         </method>
#         </fence>
# </clusternode>
# </clusternodes>
#
# <fencedevices>
#         <fencedevice name="wd" agent="fence_sanlock" device="/dev/fence/leases"/>
# </fencedevices>

# setup (one time only):
# fence_sanlock -o sanlock_init -d $device
#
# startup:
# service wdmd start
# service sanlock start
# service cman start
# - fence_node -U -> fence_sanlock -o on -> fence_sanlockd

max_hosts=128
opts=
node=
action=
device=
host_id=
offset=

help() {
	echo "Usage:"
	echo ""
	echo "fence_sanlock [options]"
	echo ""
	echo "Options:"
	echo "  -n <node>     Name of node to operate on"
	echo "  -o <action>   Action: off (default), on, or status"
	echo "                (sanlock specific actions: sanlock_init)"
	echo "  -d <device>   sanlock shared storage for leases"
	echo "  -i <host_id>  sanlock host_id of node to operate on"
	echo "  -h            Print this help, then exit"
	echo "  -V            Print program version information, then exit"
	echo ""
	echo "stdin options:"
	echo "  action=<action>"
	echo "  device=<device>"
	echo "  host_id=<host_id>"
}

cli_options() {
	while [ "$1" != "--" ]; do
		case $1 in
		-n)
			node=$2
			shift
			;;
		-o)
			action=$2
			shift
			;;
		-d)
			device=$2
			shift
			;;
		-i)
			host_id=$2
			shift
			;;
		-h)
			help
			exit 0
			;;
		-V)
			echo "fence_sanlock version @VERSION@"
			exit 0
			;;
		esac
		shift
	done
}

stdin_options() {
	while read line
	do
		key="$(echo $line | cut --delimiter== --fields=1)"
		val="$(echo $line | cut --delimiter== --fields=2)"

		if [ "$key" == "action" ]; then
			action=$val
		elif [ "$key" == "device" ]; then
			device=$val
		elif [ "$key" == "host_id" ]; then
			host_id=$val
		fi
	done
}

if [ $# -eq 0 ]; then
	stdin_options
else
	opts=$(getopt n:o:d:i:hV "$@")
	if [ "$?" != 0 ]; then
		help
		exit 1
	fi
	cli_options $opts
fi

if [[ "$device" == "" ]]; then
	echo "device arg required"
	exit 1
fi

if [[ "$action" == "" ]]; then
	echo "action arg required"
	exit 1
fi


if [[ "$action" == "on" ]]; then

	if [[ "$host_id" == "" ]]; then
		echo "host_id arg required"
		exit 1
	fi

	if [[ $host_id -eq 0 ]] || [[ $host_id -gt $max_hosts ]]; then
		echo "host_id must be between 1 and $max_hosts"
		exit 1
	fi

	offset=`expr $host_id \* 1048576`

	# verify device has been initialized

	magic=`sanlock direct read_leader -r fence:h$host_id:$device:$offset 2>&1 | grep magic | awk '{print $NF}'`

	if [[ -z "$magic" ]]; then
		echo "device not available or uninitialized"
		exit 1
	fi

	if [[ "$magic" != "0x6152010" ]]; then
		echo "device bad magic $magic"
		exit 1
	fi

	/usr/sbin/fence_sanlockd -d $device -i $host_id

	# wait for fence_sanlockd to acquire the local lease;
	# it can take minutes, and we can't allow fence_tool join
	# until this is complete

	# FIXME: tune the 180 value

	for i in `seq 1 180`; do
		# FIXME: check that the resource is really done being acquired?
		# just appearing in the status output may not be enough

		if sanlock client status | grep -q fence:h$host_id:$device:$offset; then
			break
		fi
		sleep 1
	done

	exit 0
fi

if [[ "$action" == "off" ]]; then

	if [[ "$host_id" == "" ]]; then
		echo "host_id arg required"
		exit 1
	fi

	pid=`pidof fence_sanlockd`

	offset=`expr $host_id \* 1048576`

	leader=`sanlock direct read_leader -r fence:h$host_id:$device:$offset 2>&1`

	owner_id=`echo "$leader" | grep owner_id | awk '{print $NF}'`
	owner_gen=`echo "$leader" | grep owner_gen | awk '{print $NF}'`
	ver=`echo "$leader" | grep lver | awk '{print $NF}'`

	# owner_id should equal host_id

	sanlock client request -r fence:h$host_id:$device:$offset:`expr $ver + 1` -f 2

	while :
	do
		sanlock client acquire -r fence:h$host_id:$device:$offset -p $pid
		if [[ $? -eq 0 ]]; then
			# fence success
			sanlock client release -r fence:h$host_id:$device:$offset -p $pid
			exit 0
		fi

		sleep 5

		# FIXME: there are probably some errors here where we should just fail

		# FIXME: we should probably reread the leader, and if it's been
		# reacquired cleanly by the host, I think we can quit with success
	done

	exit 1
fi

if [[ "$action" == "status" ]]; then

	if [[ "$host_id" == "" ]]; then
		echo "host_id arg required"
		exit 1
	fi

	offset=`expr $host_id \* 1048576`

	leader=`sanlock direct read_leader -r fence:h$host_id:$device:$offset 2>&1`

	if [[ $? -ne 0 ]]; then
		echo "device not available or uninitialized"
		exit 1
	fi

	timestamp=`echo "$leader" | grep timestamp | awk '{print $NF}'`

	if [[ $timestamp -eq 0 ]]; then
		# lease is released, so host is "off"
		exit 2
	fi

	# lease is held, so host is "on"
	exit 0
fi

if [[ "$action" == "sanlock_init" ]]; then

	# initialize lease device

	sanlock direct init -s fence:0:$device:0

	for host_id in `seq 1 $max_hosts`; do
		offset=`expr $host_id \* 1048576`
		sanlock direct init -r fence:h$host_id:$device:$offset
	done

	exit 0
fi

echo "Unknown action: $action"
exit 1

diff --git a/fence_sanlock/fence_sanlockd.c b/fence_sanlock/fence_sanlockd.c
new file mode 100644
index 0000000..c6280c9
--- /dev/null
+++ b/fence_sanlock/fence_sanlockd.c
@@ -0,0 +1,553 @@
/*
 * Copyright 2012 Red Hat, Inc.
 *
 * This copyrighted material is made available to anyone wishing to use,
 * modify, copy, or redistribute it subject to the terms and conditions
 * of the GNU General Public License v2 or (at your option) any later version.
 */

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <stddef.h>
#include <string.h>
#include <errno.h>
#include <fcntl.h>
#include <limits.h>
#include <unistd.h>
#include <time.h>
#include <syslog.h>
#include <signal.h>
#include <poll.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/signalfd.h>

#include "sanlock.h"
#include "sanlock_admin.h"
#include "sanlock_resource.h"
#include "wdmd.h"

/*
 * TODO:
 * handle 4k disk blocks using sanlock_align()
 * variable i/o timeouts?
 *
 * shutdown: how/when does SIGTERM happen?
 * We could have our own init script which does nothing
 * in start (since we're started by cman/fence_node),
 * but sends SIGTERM in stop.  Our init script would
 * start before cman and stop after cman.  In the future
 * we could have the init script start the daemon,
 * but that would also require a new config scheme,
 * since we wouldn't be getting the device and host_id
 * from cluster.conf via fence_node.
 *
 * The simplest option would be to have cman init just send
 * us SIGTERM after fenced is done, but that's a
 * dependency on / assumption of the cman environment
 * which we'd like to avoid; fence agents are ideally
 * independent of their environment.
 */
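
/*
 * Lease layout on the shared device, as implied by the offsets used here
 * and in fence_sanlock.in (one sanlock resource per host_id, spaced 1MB
 * apart; a sketch assuming the default 1MB sanlock alignment):
 *
 *   offset 0               lockspace "fence" (host_id delta leases)
 *   offset 1 * 1048576     resource "h1" (lease acquired to fence host_id 1)
 *   offset 2 * 1048576     resource "h2"
 *   ...
 *   offset 128 * 1048576   resource "h128"
 */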
+ */ + +#define MAX_HOSTS 128 /* keep in sync with fence_sanlock definition */ + +#define LIVE_INTERVAL 5 +#define EXPIRE_INTERVAL 20 + +#define RUN_DIR "/var/run/fence_sanlockd" + +static char *prog_name = (char *)"fence_sanlockd"; +static int shutdown; +static int we_are_victim; +static int daemon_debug; +static int our_host_id; +static char device[PATH_MAX]; +static struct sanlk_lockspace ls; +static struct sanlk_resource *r; +static char rdbuf[sizeof(struct sanlk_resource) + sizeof(struct sanlk_disk)]; +static char lockfile_path[PATH_MAX]; + +struct client { + int used; + int fd; + void *workfn; + void *deadfn; +}; + +#define CLIENT_NALLOC 3 +static int client_maxi; +static int client_size = 0; +static struct client *client = NULL; +static struct pollfd *pollfd = NULL; + +#define log_debug(fmt, args...) \ +do { \ + if (daemon_debug) \ + fprintf(stderr, "%llu " fmt "\n", (unsigned long long)time(NULL), ##args); \ +} while (0) + +#define log_error(fmt, args...) \ +do { \ + log_debug(fmt, ##args); \ + syslog(LOG_ERR, fmt, ##args); \ +} while (0) + + +static uint64_t monotime(void) +{ + struct timespec ts; + + clock_gettime(CLOCK_MONOTONIC, &ts); + + return ts.tv_sec; +} + +static void client_alloc(void) +{ + int i; + + if (!client) { + client = malloc(CLIENT_NALLOC * sizeof(struct client)); + pollfd = malloc(CLIENT_NALLOC * sizeof(struct pollfd)); + } else { + client = realloc(client, (client_size + CLIENT_NALLOC) * + sizeof(struct client)); + pollfd = realloc(pollfd, (client_size + CLIENT_NALLOC) * + sizeof(struct pollfd)); + if (!pollfd) + log_error("can't alloc for pollfd"); + } + if (!client || !pollfd) + log_error("can't alloc for client array"); + + for (i = client_size; i < client_size + CLIENT_NALLOC; i++) { + memset(&client[i], 0, sizeof(struct client)); + client[i].fd = -1; + pollfd[i].fd = -1; + pollfd[i].revents = 0; + } + client_size += CLIENT_NALLOC; +} + +static int client_add(int fd, void (*workfn)(int ci), void (*deadfn)(int ci)) +{ + int i; + + if (!client) + client_alloc(); + again: + for (i = 0; i < client_size; i++) { + if (!client[i].used) { + client[i].used = 1; + client[i].workfn = workfn; + client[i].deadfn = deadfn; + client[i].fd = fd; + pollfd[i].fd = fd; + pollfd[i].events = POLLIN; + if (i > client_maxi) + client_maxi = i; + return i; + } + } + + client_alloc(); + goto again; +} + +static int lockfile(void) +{ + char buf[16]; + struct flock lock; + mode_t old_umask; + int fd, rv; + + old_umask = umask(0022); + rv = mkdir(RUN_DIR, 0775); + if (rv < 0 && errno != EEXIST) { + umask(old_umask); + return rv; + } + umask(old_umask); + + sprintf(lockfile_path, "%s/prog_name.pid", RUN_DIR); + + fd = open(lockfile_path, O_CREAT|O_WRONLY|O_CLOEXEC, 0644); + if (fd < 0) { + log_error("lockfile open error %s: %s", + lockfile_path, strerror(errno)); + return -1; + } + + lock.l_type = F_WRLCK; + lock.l_start = 0; + lock.l_whence = SEEK_SET; + lock.l_len = 0; + + rv = fcntl(fd, F_SETLK, &lock); + if (rv < 0) { + log_error("lockfile setlk error %s: %s", + lockfile_path, strerror(errno)); + goto fail; + } + + rv = ftruncate(fd, 0); + if (rv < 0) { + log_error("lockfile truncate error %s: %s", + lockfile_path, strerror(errno)); + goto fail; + } + + memset(buf, 0, sizeof(buf)); + snprintf(buf, sizeof(buf), "%d\n", getpid()); + + rv = write(fd, buf, strlen(buf)); + if (rv <= 0) { + log_error("lockfile write error %s: %s", + lockfile_path, strerror(errno)); + goto fail; + } + + return fd; + fail: + close(fd); + return -1; +} + +static void process_signals(int ci) +{ + struct 

static void process_signals(int ci)
{
	struct signalfd_siginfo fdsi;
	ssize_t rv;
	int fd = client[ci].fd;

	rv = read(fd, &fdsi, sizeof(struct signalfd_siginfo));
	if (rv != sizeof(struct signalfd_siginfo)) {
		return;
	}

	if (fdsi.ssi_signo == SIGTERM) {
		shutdown = 1;
	}

	if (fdsi.ssi_signo == SIGUSR1) {
		we_are_victim = 1;
	}
}

static int setup_signals(void)
{
	sigset_t mask;
	int fd, rv;

	sigemptyset(&mask);
	sigaddset(&mask, SIGTERM);
	sigaddset(&mask, SIGUSR1);

	rv = sigprocmask(SIG_BLOCK, &mask, NULL);
	if (rv < 0)
		return rv;

	fd = signalfd(-1, &mask, 0);
	if (fd < 0)
		return -errno;

	client_add(fd, process_signals, NULL);
	return 0;
}

static void print_usage(void)
{
	printf("Usage:\n");
	printf("fence_sanlockd -d <path> -i <host_id>\n");
	printf("\n");
	printf("Options:\n");
	printf("  -D            Enable debugging to stderr and don't fork\n");
	printf("  -d <path>     Path to shared storage with sanlock leases\n");
	printf("  -i <host_id>  Local sanlock host_id\n");
	printf("  -h            Print this help, then exit\n");
	printf("  -V            Print program version information, then exit\n");
}

int main(int argc, char *argv[])
{
	void (*workfn) (int ci);
	void (*deadfn) (int ci);
	uint64_t live_time, now;
	int poll_timeout;
	int sleep_seconds;
	int cont = 1;
	int optchar;
	int sock, con, rv, i;

	while (cont) {
		optchar = getopt(argc, argv, "Dd:i:hV");

		switch (optchar) {
		case 'D':
			daemon_debug = 1;
			break;
		case 'd':
			snprintf(device, sizeof(device), "%s", optarg);
			break;
		case 'i':
			our_host_id = atoi(optarg);
			break;
		case 'h':
			print_usage();
			exit(0);
		case 'V':
			printf("fence_sanlockd %s (built %s %s)\n",
			       VERSION, __DATE__, __TIME__);
			exit(0);
		case EOF:
			cont = 0;
			break;
		default:
			fprintf(stderr, "unknown option %c\n", optchar);
			exit(1);
		};
	}

	if (our_host_id < 1 || our_host_id > MAX_HOSTS) {
		daemon_debug = 1;
		log_error("-i %d invalid, our_host_id must be between 1 and %d",
			  our_host_id, MAX_HOSTS);
		exit(1);
	}

	if (!device[0]) {
		daemon_debug = 1;
		log_error("device arg required");
		exit(1);
	}

	if (!daemon_debug) {
		if (daemon(0, 0) < 0) {
			fprintf(stderr, "cannot fork daemon\n");
			exit(EXIT_FAILURE);
		}
	}

	openlog(prog_name, LOG_CONS | LOG_PID, LOG_DAEMON);

	rv = lockfile();
	if (rv < 0)
		goto out;

	rv = setup_signals();
	if (rv < 0)
		goto out_lockfile;

	con = wdmd_connect();
	if (con < 0) {
		log_error("wdmd connect error %d", con);
		goto out_lockfile;
	}

	rv = wdmd_register(con, (char *)"fence_sanlockd");
	if (rv < 0) {
		log_error("wdmd register error %d", rv);
		goto out_lockfile;
	}

	rv = wdmd_refcount_set(con);
	if (rv < 0) {
		log_error("wdmd refcount error %d", rv);
		goto out_lockfile;
	}

	sock = sanlock_register();
	if (sock < 0) {
		log_error("register error %d", sock);
		goto out_refcount;
	}

	rv = sanlock_restrict(sock, SANLK_RESTRICT_SIGKILL);
	if (rv < 0) {
		log_error("restrict error %d", rv);
		goto out_refcount;
	}

	memset(&ls, 0, sizeof(ls));
	snprintf(ls.host_id_disk.path, sizeof(ls.host_id_disk.path), "%s", device);
	strcpy(ls.name, "fence");
	ls.host_id = our_host_id;

	log_debug("add_lockspace begin");

	rv = sanlock_add_lockspace(&ls, 0);
	if (rv < 0) {
		log_error("add_lockspace error %d", rv);
		goto out_refcount;
	}

	log_debug("add_lockspace done %d", rv);

	memset(rdbuf, 0, sizeof(rdbuf));
	r = (struct sanlk_resource *)&rdbuf;
	strcpy(r->lockspace_name, "fence");
	sprintf(r->name, "h%d", our_host_id);
	snprintf(r->disks[0].path, sizeof(r->disks[0].path), "%s", device);
	r->disks[0].offset = our_host_id * 1048576;
	r->num_disks = 1;
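
	/*
	 * Acquiring our own "h<host_id>" lease is what makes us fenceable:
	 * fence_sanlock on another node requests this same lease (with
	 * force flag 2), which prompts our sanlock daemon to send us
	 * SIGUSR1.
	 */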
	log_debug("acquire begin");

	rv = sanlock_acquire(sock, -1, 0, 1, &r, NULL);
	if (rv < 0) {
		log_error("acquire error %d", rv);
		goto out_lockspace;
	}

	log_debug("acquire done %d", rv);

	/* at this point we can be fenced by someone */

	now = monotime();
	live_time = now;
	log_debug("test live %llu", (unsigned long long)now);
	rv = wdmd_test_live(con, now, now + EXPIRE_INTERVAL);
	if (rv < 0) {
		log_error("wdmd_test_live first error %d", rv);
		goto out_release;
	}

	sleep_seconds = live_time + LIVE_INTERVAL - monotime();
	poll_timeout = (sleep_seconds > 0) ? sleep_seconds * 1000 : 500;

	while (1) {
		rv = poll(pollfd, client_maxi + 1, poll_timeout);
		if (rv == -1 && errno == EINTR)
			continue;
		if (rv < 0) {
			/* unexpected poll error; nothing obvious to do, so
			   fall through and let the revents checks below
			   handle any dead fds */
		}

		for (i = 0; i <= client_maxi; i++) {
			if (client[i].fd < 0)
				continue;
			if (pollfd[i].revents & POLLIN) {
				workfn = client[i].workfn;
				if (workfn)
					workfn(i);
			}
			if (pollfd[i].revents & (POLLERR | POLLHUP | POLLNVAL)) {
				deadfn = client[i].deadfn;
				if (deadfn)
					deadfn(i);
			}
		}

		now = monotime();

		if (shutdown) {
			/*
			 * FIXME: how to be sure that it's safe for us to shut
			 * down?  If service cman stop sends us SIGTERM only
			 * after cleanly stopping dlm and fenced, then it is
			 * safe to just shut down here.
			 *
			 * If we want to handle the case of something else
			 * sending us a spurious SIGTERM when we still need to
			 * be running, then it's more difficult.
			 *
			 * There's no perfect solution, since what we're
			 * protecting with fencing is not precisely defined.
			 * Generally it's dlm/gfs/rgmanager via fenced, so we
			 * could check that fenced is not running or that no
			 * dlm lockspaces exist (like fence_tool leave does).
			 * But those checks make assumptions about the cluster
			 * environment, which fence agents should avoid as
			 * much as possible.  And in some other, unknown
			 * environment, I don't know what we'd check to verify
			 * that shutting down is safe anyway.
			 *
			 * So, for now, just assume SIGTERM was sent
			 * appropriately and shut down.
			 */
			log_error("shutdown");
			rv = wdmd_test_live(con, 0, 0);
			if (rv < 0)
				log_error("wdmd_test_live 0 error %d", rv);
			break;

		} else if (we_are_victim) {
			/*
			 * The sanlock daemon has seen someone request our
			 * lease, which happens when they run fence_sanlock
			 * against us.  In response to the request, our sanlock
			 * daemon has sent us SIGUSR1.
			 *
			 * Do not call wdmd_test_live, so wdmd will see our
			 * connection expire, and will quit petting the
			 * watchdog, which will then fire in 60 sec.  sanlock
			 * continues renewing its host_id until the machine
			 * dies, and the node doing fencing will then be able
			 * to acquire our lease host_dead_seconds after our
			 * last sanlock renewal.
			 *
			 * TODO: we could eventually attempt to kill/unmount/etc
			 * anything using shared storage, and if that all works,
			 * then we could do a clean shutdown afterward.  That
			 * would often not work, because dlm/gfs would be stuck
			 * in the kernel due to the failure (cluster partition)
			 * that caused our fencing, and couldn't be forcibly
			 * cleaned up.
			 */
			log_error("we are being fenced, our watchdog will fire");

		} else if (now - live_time >= LIVE_INTERVAL) {
			/*
			 * How to pick the expire_time.  From the perspective
			 * of fence_sanlockd the expire_time isn't really
			 * important.  It should be far enough in the future
			 * that it's:
			 * - after the next time we're going to call test_live,
			 *   because our test_live calls are obviously meant to
			 *   keep it from expiring
			 * - more than 10 seconds in the future because of a
			 *   current quirk in wdmd, where it pre-emptively
			 *   closes the wd 10 seconds before the actual expire
			 *   time (see comments in wdmd for the reason).  So we
			 *   want to be sure we renew at least 10 sec before
			 *   the last expire time.
			 *
			 * It shouldn't be too long, because when we see we're
			 * being fenced, we'll quit calling test_live, and we
			 * want our watchdog to reset us in a fairly short
			 * amount of time after that (this affects how long the
			 * fencing node has to wait).  The longer the
			 * expire_time we provide, the longer it'll take before
			 * wdmd sees it expire, quits petting the wd, and
			 * resets us.
			 *
			 * So, if we have set expire_time to 20 sec in the
			 * future, and we renew once every 5 sec, we have two
			 * chances to renew before a pre-emptive close.
			 */
			live_time = now;
			log_debug("test live %llu", (unsigned long long)now);
			rv = wdmd_test_live(con, now, now + EXPIRE_INTERVAL);
			if (rv < 0)
				log_error("wdmd_test_live error %d", rv);
		}

		if (we_are_victim) {
			poll_timeout = 10000;
		} else {
			sleep_seconds = live_time + LIVE_INTERVAL - monotime();
			poll_timeout = (sleep_seconds > 0) ? sleep_seconds * 1000 : 500;
		}
	}

 out_release:
	sanlock_release(sock, -1, 0, 1, &r);
 out_lockspace:
	sanlock_rem_lockspace(&ls, 0);
 out_refcount:
	wdmd_refcount_clear(con);
 out_lockfile:
	unlink(lockfile_path);
 out:
	return rv;
}
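
---

Example usage (illustrative only; the lease device path, host_ids, and
node name below are assumptions, not part of the patch):

  # one time, on one node, after setting up shared storage for leases:
  fence_sanlock -o sanlock_init -d /dev/fence/leases

  # at startup on each node (normally driven by fence_node -U):
  service wdmd start
  service sanlock start
  fence_sanlock -o on -d /dev/fence/leases -i 1

  # fence the node with host_id 2:
  fence_sanlock -o off -d /dev/fence/leases -n node2 -i 2

  # check lease state for host_id 2: exit 0 = lease held ("on"),
  # exit 2 = lease released ("off"):
  fence_sanlock -o status -d /dev/fence/leases -i 2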