From 1989cf38102d5b6055405887ee9278a86ec1c4c0 Mon Sep 17 00:00:00 2001 From: David Teigland Date: Apr 09 2024 18:04:21 +0000 Subject: sanlock: fix release writing zero dblock values Incorrect use of release_token was leading to a case where zero values would be written into the dblock. As mentioned in other code comments, it is unsafe for a host to write zeros values into its dblock, because those zero values can be used by, and alter the results of, a paxos ballot that is still running on other hosts. This can lead to multiple hosts running a paxos ballot for the same leader version an reaching different results, i.e. multiple owners of the lease at the same time. This was observed when a host running a paxos ballot commits another host as the owner, so fails to acquire the lease for itself, and returns SANLK_ACQUIRE_OTHER. After this, the code calls release_token, which was mistakenly called in way that would write zeros to the dblock (and mblock). A new variant of release_token is now used to avoid writing the dblock in this case. Original report and diagnosis from 山吴. --- diff --git a/src/resource.c b/src/resource.c index cad8af5..7e81cdb 100644 --- a/src/resource.c +++ b/src/resource.c @@ -451,6 +451,37 @@ static int write_host_block(struct task *task, struct token *token, * values. */ if (pd) { + if (!pd->mbal || !pd->bal || !pd->inp) { + /* + * In general this shouldn't happen, but this warning + * has been observed in two cases so far: + * a. RETRACT_PAXOS error handling (TODO: fix this by + * rereading our dblock before writing it back with + * zero mblock values.) + * b. releasing lease when the last dblock that was + * returned from paxos_lease_acquire has zero bal + * and zero inp, because our last ballot was + * was aborted, and then we detected another host + * committed us as the owner. The other host committed + * us as the owner using a previous dblock that we wrote + * but had aborted. i.e. we did: + * 1. run_ballot write dblock with non-zero mbal,bal,inp + * 2. abort2 the ballot when we see a larger mbal + * 3. run_ballot write dblock with increased non-zero mbal, + * bal=0, inp=0 + * 4. abort1 the ballot when we see a larger mbal + * 5. read leader and see that another host committed + * the non-zero mbal,bal,inp we wrote in step 1. + * The "pd" we have here is the dblock we wrote in step 3, + * that includes zero bal and inp. + * I don't believe this is a problem. + */ + log_warnt(token, "write_host_block with zero dblock values %llu %llu %llu", + (unsigned long long)pd->mbal, + (unsigned long long)pd->bal, + (unsigned long long)pd->inp); + } + if (pd->inp && (pd->inp != token->host_id)) { /* This should never happen, sanity check. */ log_errot(token, "Ignore bad dblock while writing mblock %llu:%llu:%llu:%llu", @@ -984,6 +1015,12 @@ static int _release_token(struct task *task, struct token *token, goto out; } + if (nodisk && opened) { + log_token(token, "release_token close only"); + close_disks(token->disks, token->r.num_disks); + goto out; + } + if (nodisk) goto out; @@ -1000,18 +1037,20 @@ static int _release_token(struct task *task, struct token *token, r_flags, (unsigned long long)lver); /* - * In all cases we want to (or can) clear both dblock and mblock. - * - * Cases where we want to release ownership of the leader: - * . releasing ex lease !(r_flags & R_SHARED) - * . R_UNDO_SHARED - * . R_ERASE_ALL + * Cases where we want to release ownership of the leader + * (paxos_lease_release or release_disk): + * . normal release of ex lease !(r_flags & R_SHARED) + * where we should definitely be the leader owner. + * . R_UNDO_SHARED (only if we are actually leader owner) + * . R_ERASE_ALL (only if we are actually leader owner) * * Cases where we don't want to release ownership of the leader: * . releasing sh lease: (r_flags & R_SHARED) */ if (r_flags & R_ERASE_ALL) { + /* TODO: reread our dblock, then write it back with zeroed + mblock fields, so that we do not zero our dblock fields. */ rv = write_mblock_zero_dblock_release(task, token); if (rv < 0) { log_errot(token, "release_token erase all write_host_block %d", rv); @@ -1041,6 +1080,8 @@ static int _release_token(struct task *task, struct token *token, (unsigned long long)lver, rv); } else if (r_flags & R_UNDO_SHARED) { + /* TODO: reread our dblock, then write it back with zeroed + mblock fields, so that we do not zero our dblock fields. */ rv = write_mblock_zero_dblock_release(task, token); if (rv < 0) { log_errot(token, "release_token undo shared write_host_block %d", rv); @@ -1135,6 +1176,11 @@ static int _release_token(struct task *task, struct token *token, return SANLK_AIO_TIMEOUT; } +static int release_token_nodisk_opened(struct task *task, struct token *token) +{ + return _release_token(task, token, NULL, 1, 1); +} + static int release_token_nodisk(struct task *task, struct token *token) { return _release_token(task, token, NULL, 0, 1); @@ -1846,24 +1892,26 @@ int acquire_token(struct task *task, struct token *token, uint32_t cmd_flags, usleep(us); goto retry; } - /* zero r->leader means not owned and release will just close */ - release_token_opened(task, token); + /* We must not write zero dblock values from write_host_block()! */ + release_token_nodisk_opened(task, token); + if (com.quiet_fail) + log_token(token, "acquire_token error: %d held retried", rv); + else + log_errot(token, "acquire_token error: %d held retried", rv); return SANLK_ACQUIRE_SHRETRY; } if (com.quiet_fail) - log_token(token, "acquire_token held error %d", rv); + log_token(token, "acquire_token error: %d held", rv); else - log_errot(token, "acquire_token held error %d", rv); - /* zero r->leader means not owned and release will just close */ - release_token_opened(task, token); + log_errot(token, "acquire_token error: %d held", rv); + /* We must not write zero dblock values from write_host_block()! */ + release_token_nodisk_opened(task, token); return rv; } if (rv < 0 && !(token->flags & T_RETRACT_PAXOS)) { - log_token(token, "acquire_token disk error %d", rv); - r->flags &= ~R_SHARED; - /* zero r->leader means not owned and release will just close */ - release_token_opened(task, token); + log_token(token, "acquire_token error: %d", rv); + release_token_nodisk_opened(task, token); return rv; } @@ -1872,7 +1920,7 @@ int acquire_token(struct task *task, struct token *token, uint32_t cmd_flags, * We might own the lease, we don't know, so we need to try to * release on disk to avoid possibly having an orphan lease on disk. */ - log_errot(token, "acquire_token disk error %d RETRACT_PAXOS", rv); + log_errot(token, "acquire_token error: %d RETRACT_PAXOS", rv); r->flags &= ~R_SHARED; r->flags |= R_ERASE_ALL; memcpy(&r->leader, &leader, sizeof(struct leader_record));