From 2390c8f364a466534ceda324fb822741b892e075 Mon Sep 17 00:00:00 2001 From: David Teigland Date: Jan 17 2024 17:43:44 +0000 Subject: sanlock: improve repeated ballots in paxos_acquire The paxos implementation was inefficient in advancing the mbal between successive ballots. If the local mbal was much less than the mbal used by other hosts, the local mbal was increased by just num_hosts repeatedly until it reached the other mbal. Now we advance the local mbal directly to the base value last seen from other hosts. Original suggestion/patch and final testing from 张伟. --- diff --git a/src/paxos_lease.c b/src/paxos_lease.c index 5643739..2ad960d 100644 --- a/src/paxos_lease.c +++ b/src/paxos_lease.c @@ -481,7 +481,7 @@ static int verify_dblock(struct token *token, struct paxos_dblock *pd, uint32_t static int run_ballot(struct task *task, struct token *token, uint32_t flags, int num_hosts, uint64_t next_lver, uint64_t our_mbal, - struct paxos_dblock *dblock_out) + struct paxos_dblock *dblock_out, uint64_t *other_max_mbal) { char bk_debug[BK_DEBUG_SIZE]; char bk_str[BK_STR_SIZE]; @@ -520,7 +520,6 @@ static int run_ballot(struct task *task, struct token *token, uint32_t flags, return rv; } - /* * phase 1 * @@ -665,6 +664,7 @@ static int run_ballot(struct task *task, struct token *token, uint32_t flags, log_token(token, "ballot %llu phase1 read %s", (unsigned long long)next_lver, bk_debug); + *other_max_mbal = bk->mbal; error = SANLK_DBLOCK_MBAL; goto out; } @@ -884,6 +884,7 @@ static int run_ballot(struct task *task, struct token *token, uint32_t flags, log_token(token, "ballot %llu phase2 read %s", (unsigned long long)next_lver, bk_debug); + *other_max_mbal = bk->mbal; error = SANLK_DBLOCK_MBAL; goto out; } @@ -1695,6 +1696,7 @@ int paxos_lease_acquire(struct task *task, uint64_t max_mbal; uint64_t num_mbal; uint64_t our_mbal; + uint64_t other_max_mbal; int copy_cur_leader; int disk_open = 0; int error, rv, us; @@ -2121,7 +2123,9 @@ int paxos_lease_acquire(struct task 
*task, goto restart; } - error = run_ballot(task, token, flags, cur_leader.num_hosts, next_lver, our_mbal, &dblock); + other_max_mbal = 0; + + error = run_ballot(task, token, flags, cur_leader.num_hosts, next_lver, our_mbal, &dblock, &other_max_mbal); if ((error == SANLK_DBLOCK_MBAL) || (error == SANLK_DBLOCK_LVER)) { us = get_rand(0, 1000000); @@ -2132,7 +2136,33 @@ int paxos_lease_acquire(struct task *task, (unsigned long long)next_lver, us); usleep(us); - our_mbal += cur_leader.max_hosts; + + /* + * If other hosts have advanced max_mbal far beyond where we're at, then + * skip ahead to the base max_mbal seen in the last ballot, rather than + * just adding max_hosts to our_mbal repeatedly until we catch up. + * (Note: run_ballot returns the first larger mbal it finds, and does not + * scan all dblocks for the largest mbal that exists. So this doesn't + * always skip ahead as far as it could. All dblocks could be scanned + * to find the largest mbal, but it would be a bigger code change.) + */ + if (other_max_mbal > (our_mbal + (2 * cur_leader.max_hosts))) { + uint64_t other_host_id, other_base_mbal, our_next_mbal; + other_host_id = other_max_mbal % cur_leader.max_hosts; + other_base_mbal = other_max_mbal - other_host_id; + our_next_mbal = other_base_mbal + token->host_id; + + log_warnt(token, "paxos_acquire %llu other_max_mbal %llu advance our_mbal from %llu to %llu", + (unsigned long long)next_lver, + (unsigned long long)other_max_mbal, + (unsigned long long)our_mbal, + (unsigned long long)our_next_mbal); + + our_mbal = our_next_mbal; + } else { + our_mbal += cur_leader.max_hosts; + } + goto retry_ballot; }