From d737809488521c15bc60fac9e34a7b872d2ac94c Mon Sep 17 00:00:00 2001 From: David Teigland Date: Dec 11 2023 22:36:27 +0000 Subject: sanlock: add comment with timeout example --- diff --git a/src/timeouts.h b/src/timeouts.h index ac9842a..73b02f1 100644 --- a/src/timeouts.h +++ b/src/timeouts.h @@ -464,6 +464,83 @@ * watchdog really fires. */ +/* + * Example of how long it takes to acquire a lease that was held + * by a failed host. + * + * wdmd test interval = 5 + * watchdog_fire_timeout = 30 + * io_timeout_seconds = 10 + * id_renewal_seconds = 20 + * id_renewal_fail_seconds = 80 + * host_dead_seconds = 110 (id_renewal_fail_seconds + watchdog_fire_timeout) + * + * host that was killed + * -------------------- + * sanlock debug output: + * 13:31:44 7630 [2157]: s1 delta_renew begin read + * 13:31:44 7630 [2157]: s1 delta_renew begin write for new ts 7630 + * 13:31:44 7630 [2157]: s1 renewed 7630 delta_length 0 interval 20 + * + * + * wdmd debug output: + * 1702323101 keepalive 0 + * 1702323101 test_interval 5 sleep_seconds 5 poll_timeout 5000 + * 1702323104 test_live ci 2 renewal 7630 expire 7710 + * 1702323104 test_interval 5 sleep_seconds 3 poll_timeout 3000 + * + * + * host_dead_seconds (110) after last successful renewal (13:31:44) + * would be 13:33:34 as the earliest another host can take a lease. + * + * host that was trying to acquire lease held by dead host + * ------------------------------------------------------- + * running: while true; do date; lvchange -ay bb/lvol1; sleep 1; done + * Mon Dec 11 13:33:50 CST 2023 + * LV locked by other host: bb/lvol1 + * Failed to lock logical volume bb/lvol1. + * Mon Dec 11 13:33:51 CST 2023 + * + * + * sanlock debug output: + * 13:31:40 7584 [2342]: s1 renewed 7584 delta_length 0 interval 20 + * ... + * 13:32:00 7605 [2342]: s1 renewed 7605 delta_length 0 interval 21 + * ... + * in that renewal at 7605, we see the last renewal from the other host, + * which is shown above from the other host's logs at "13:31:44 7630". + * we record our own current time 7605 as the last live time for the dead host. + * our initial attempts to get a lease held by the dead host will fail + * as follows until host_dead_seconds has elapased: + * paxos_acquire owner 2 2 7431 host_status 2 2 7630 wait_start 7605 + * paxos_acquire owner 2 2 7431 no wait + * acquire_disk rv -247 lver 0 at 0 + * acquire_token disk error -247 + * release_token r_flags 8 lver 0 + * release_token done r_flags 8 + * cmd_acquire 2,9,2124 acquire_token -247 lease owned by other host + * + * after retrying for host_dead_seconds, it succeeds: + * 13:33:51 7716 [2282]: s1:r5 paxos_acquire begin offset 71303168 0xa 512 1048576 + * 13:33:51 7716 [2282]: s1:r5 leader 2 owner 2 2 7431 dblocks 1:2002:2002:2:2:7431:2:0, + * 13:33:51 7716 [2282]: s1:r5 paxos_acquire leader 2 owner 2 2 7431 max mbal[1] 2002 our_dblock 0 0 0 0 0 0 + * 13:33:51 7716 [2282]: s1:r5 paxos_acquire owner 2 2 7431 host_status 2 2 7630 wait_start 7605 + * 13:33:51 7716 [2282]: s1:r5 paxos_acquire owner 2 2 7431 delta 2 2 7630 dead 7716-7605>110 + * 13:33:51 7716 [2282]: s1:r5 paxos_acquire leader 2 owner 2 2 7431 + * + * This shows that the difference between the current time (7716) and the + * time it last saw the dead host alive (7605) is >= host_dead_seconds (110), + * so it's allowed to get the lease. + * + * The host was killed at 13:31:44 and its lease was acquired at 13:33:51. + * This is 127 seconds, which includes the 110 host_dead_seconds, plus + * 17 extra seconds that elapsed between the time the host was killed + * and the time that the other host ran its own renewal and recorded the + * final renewal of the dead host. + * The host that was killed last renewed at 13:31:44, and the other host's + * next renewal after that was at 13:32:00 which is a 16 second difference. + */ + /* * killing pids