From 34a0af6017e91cfe3370ecae1c06ce5391afea6a Mon Sep 17 00:00:00 2001 From: David Teigland Date: Feb 02 2018 17:02:14 +0000 Subject: sanlock: add resource index --- diff --git a/src/Makefile b/src/Makefile index 9e914d9..1ef6852 100644 --- a/src/Makefile +++ b/src/Makefile @@ -33,6 +33,7 @@ CMD_SOURCE = \ task.c \ timeouts.c \ resource.c \ + rindex.c \ watchdog.c \ monotime.c \ cmd.c \ @@ -48,6 +49,7 @@ LIB_ENTIRE_SOURCE = \ ondisk.c \ delta_lease.c \ paxos_lease.c \ + rindex.c \ direct.c \ task.c \ timeouts.c \ diff --git a/src/client.c b/src/client.c index fce6c60..21c90c3 100644 --- a/src/client.c +++ b/src/client.c @@ -1537,6 +1537,286 @@ int sanlock_get_lvb(uint32_t flags, struct sanlk_resource *res, char *lvb, int l return rv; } +int sanlock_format_rindex(struct sanlk_rindex *rx, uint32_t flags) +{ + int rv, fd; + + if (!rx || !rx->lockspace_name[0] || !rx->disk.path[0]) + return -EINVAL; + + rv = connect_socket(&fd); + if (rv < 0) + return rv; + + rv = send_header(fd, SM_CMD_FORMAT_RINDEX, flags, + sizeof(struct sanlk_rindex), 0, 0); + if (rv < 0) + goto out; + + rv = send_data(fd, rx, sizeof(struct sanlk_rindex), 0); + if (rv < 0) { + rv = -errno; + goto out; + } + + rv = recv_result(fd); + out: + close(fd); + return rv; +} + +int sanlock_rebuild_rindex(struct sanlk_rindex *rx, uint32_t flags) +{ + int rv, fd; + + if (!rx || !rx->lockspace_name[0] || !rx->disk.path[0]) + return -EINVAL; + + rv = connect_socket(&fd); + if (rv < 0) + return rv; + + rv = send_header(fd, SM_CMD_REBUILD_RINDEX, flags, + sizeof(struct sanlk_rindex), 0, 0); + if (rv < 0) + goto out; + + rv = send_data(fd, rx, sizeof(struct sanlk_rindex), 0); + if (rv < 0) { + rv = -errno; + goto out; + } + + rv = recv_result(fd); + out: + close(fd); + return rv; +} + +int sanlock_update_rindex(struct sanlk_rindex *rx, uint32_t flags, + struct sanlk_rentry *re) +{ + struct sanlk_rentry re_recv; + int rv, fd; + + memset(&re_recv, 0, sizeof(re_recv)); + + if (!rx || !rx->lockspace_name[0] || 
!rx->disk.path[0] || !re) + return -EINVAL; + + rv = connect_socket(&fd); + if (rv < 0) + return rv; + + rv = send_header(fd, SM_CMD_UPDATE_RINDEX, flags, + sizeof(struct sanlk_rindex) + + sizeof(struct sanlk_rentry), + 0, 0); + if (rv < 0) + goto out; + + rv = send_data(fd, rx, sizeof(struct sanlk_rindex), 0); + if (rv < 0) { + rv = -errno; + goto out; + } + + rv = send_data(fd, re, sizeof(struct sanlk_rentry), 0); + if (rv < 0) { + rv = -1; + goto out; + } + + rv = recv_result(fd); + if (rv < 0) + goto out; + + rv = recv_data(fd, &re_recv, sizeof(struct sanlk_rentry), MSG_WAITALL); + if (rv < 0) { + rv = -errno; + goto out; + } + + if (rv != sizeof(struct sanlk_rentry)) { + rv = -1; + goto out; + } + + memcpy(re, &re_recv, sizeof(struct sanlk_rentry)); + rv = 0; + out: + close(fd); + return rv; +} + +int sanlock_lookup_rindex(struct sanlk_rindex *rx, uint32_t flags, + struct sanlk_rentry *re) +{ + struct sanlk_rentry re_recv; + int rv, fd; + + memset(&re_recv, 0, sizeof(re_recv)); + + if (!rx || !rx->lockspace_name[0] || !rx->disk.path[0] || !re) + return -EINVAL; + + rv = connect_socket(&fd); + if (rv < 0) + return rv; + + rv = send_header(fd, SM_CMD_LOOKUP_RINDEX, flags, + sizeof(struct sanlk_rindex) + + sizeof(struct sanlk_rentry), /* declared length must match the rindex and rentry payloads sent below */ + 0, 0); + if (rv < 0) + goto out; + + rv = send_data(fd, rx, sizeof(struct sanlk_rindex), 0); + if (rv < 0) { + rv = -errno; + goto out; + } + + rv = send_data(fd, re, sizeof(struct sanlk_rentry), 0); + if (rv < 0) { + rv = -1; + goto out; + } + + rv = recv_result(fd); + if (rv < 0) + goto out; + + rv = recv_data(fd, &re_recv, sizeof(struct sanlk_rentry), MSG_WAITALL); + if (rv < 0) { + rv = -errno; + goto out; + } + + if (rv != sizeof(struct sanlk_rentry)) { + rv = -1; + goto out; + } + + memcpy(re, &re_recv, sizeof(struct sanlk_rentry)); + rv = 0; + out: + close(fd); + return rv; +} + +int sanlock_create_resource(struct sanlk_rindex *rx, uint32_t flags, + struct sanlk_rentry *re, + int max_hosts, int num_hosts) +{ + struct 
sanlk_rentry re_recv; + int rv, fd; + + memset(&re_recv, 0, sizeof(re_recv)); + + if (!rx || !rx->lockspace_name[0] || !rx->disk.path[0] || !re) + return -EINVAL; + + rv = connect_socket(&fd); + if (rv < 0) + return rv; + + rv = send_header(fd, SM_CMD_CREATE_RESOURCE, flags, + sizeof(struct sanlk_rindex) + + sizeof(struct sanlk_rentry), + max_hosts, num_hosts); + if (rv < 0) + goto out; + + rv = send_data(fd, rx, sizeof(struct sanlk_rindex), 0); + if (rv < 0) { + rv = -errno; + goto out; + } + + rv = send_data(fd, re, sizeof(struct sanlk_rentry), 0); + if (rv < 0) { + rv = -1; + goto out; + } + + rv = recv_result(fd); + if (rv < 0) + goto out; + + rv = recv_data(fd, &re_recv, sizeof(struct sanlk_rentry), MSG_WAITALL); + if (rv < 0) { + rv = -errno; + goto out; + } + + if (rv != sizeof(struct sanlk_rentry)) { + rv = -1; + goto out; + } + + memcpy(re, &re_recv, sizeof(struct sanlk_rentry)); + rv = 0; + out: + close(fd); + return rv; +} + +int sanlock_delete_resource(struct sanlk_rindex *rx, uint32_t flags, + struct sanlk_rentry *re) +{ + struct sanlk_rentry re_recv; + int rv, fd; + + memset(&re_recv, 0, sizeof(re_recv)); + + if (!rx || !rx->lockspace_name[0] || !rx->disk.path[0] || !re) + return -EINVAL; + + rv = connect_socket(&fd); + if (rv < 0) + return rv; + + rv = send_header(fd, SM_CMD_DELETE_RESOURCE, flags, + sizeof(struct sanlk_rindex) + + sizeof(struct sanlk_rentry), + 0, 0); + if (rv < 0) + goto out; + + rv = send_data(fd, rx, sizeof(struct sanlk_rindex), 0); + if (rv < 0) { + rv = -errno; + goto out; + } + + rv = send_data(fd, re, sizeof(struct sanlk_rentry), 0); + if (rv < 0) { + rv = -1; + goto out; + } + + rv = recv_result(fd); + if (rv < 0) + goto out; + + rv = recv_data(fd, &re_recv, sizeof(struct sanlk_rentry), MSG_WAITALL); + if (rv < 0) { + rv = -errno; + goto out; + } + + if (rv != sizeof(struct sanlk_rentry)) { + rv = -1; + goto out; + } + + rv = 0; + out: + close(fd); + return rv; +} + /* * src may have colons/spaces escaped (with backslash) or 
unescaped. * if unescaped colons/spaces are found, insert backslash before them. diff --git a/src/cmd.c b/src/cmd.c index fa90482..d3ce8f3 100644 --- a/src/cmd.c +++ b/src/cmd.c @@ -46,6 +46,7 @@ #include "direct.h" #include "task.h" #include "cmd.h" +#include "rindex.h" /* from main.c */ void client_resume(int ci); @@ -2027,6 +2028,118 @@ reply: client_resume(ca->ci_in); } +static void cmd_format_rindex(struct task *task, struct cmd_args *ca) +{ + struct sanlk_rindex ri; + int fd, rv, result; + + fd = client[ca->ci_in].fd; + + rv = recv(fd, &ri, sizeof(struct sanlk_rindex), MSG_WAITALL); + if (rv != sizeof(struct sanlk_rindex)) { + log_error("cmd_format_rindex %d,%d recv %d %d", + ca->ci_in, fd, rv, errno); + result = -ENOTCONN; + goto reply; + } + + log_debug("cmd_format_rindex %d,%d %.48s %s:%llu", + ca->ci_in, fd, ri.lockspace_name, + ri.disk.path, + (unsigned long long)ri.disk.offset); + + result = rindex_format(task, &ri); + reply: + log_debug("cmd_format_rindex %d,%d done %d", ca->ci_in, fd, result); + + send_result(fd, &ca->header, result); + client_resume(ca->ci_in); +} + +static void cmd_rebuild_rindex(struct task *task, struct cmd_args *ca) +{ + struct sanlk_rindex ri; + int fd, rv, result; + + fd = client[ca->ci_in].fd; + + rv = recv(fd, &ri, sizeof(struct sanlk_rindex), MSG_WAITALL); + if (rv != sizeof(struct sanlk_rindex)) { + log_error("cmd_rebuild_rindex %d,%d recv %d %d", + ca->ci_in, fd, rv, errno); + result = -ENOTCONN; + goto reply; + } + + log_debug("cmd_rebuild_rindex %d,%d %.48s %s:%llu", + ca->ci_in, fd, ri.lockspace_name, + ri.disk.path, + (unsigned long long)ri.disk.offset); + + result = rindex_rebuild(task, &ri, ca->header.cmd_flags); + reply: + log_debug("cmd_rebuild_rindex %d,%d done %d", ca->ci_in, fd, result); + + send_result(fd, &ca->header, result); + client_resume(ca->ci_in); +} + +static void rindex_op(struct task *task, struct cmd_args *ca, const char *cmd, int op) +{ + struct sanlk_rindex ri; + struct sanlk_rentry re; + struct 
sanlk_rentry re_ret; + struct sm_header h; + int fd, rv, result; + + memset(&re_ret, 0, sizeof(re_ret)); + + fd = client[ca->ci_in].fd; + + rv = recv(fd, &ri, sizeof(struct sanlk_rindex), MSG_WAITALL); + if (rv != sizeof(struct sanlk_rindex)) { + log_error("%s %d,%d recv %d %d", cmd, ca->ci_in, fd, rv, errno); + result = -ENOTCONN; + goto reply; + } + + rv = recv(fd, &re, sizeof(struct sanlk_rentry), MSG_WAITALL); + if (rv != sizeof(struct sanlk_rentry)) { + log_error("%s %d,%d recv %d %d", cmd, ca->ci_in, fd, rv, errno); + result = -ENOTCONN; + goto reply; + } + + log_debug("%s %d,%d %.48s %s:%llu", cmd, + ca->ci_in, fd, ri.lockspace_name, + ri.disk.path, + (unsigned long long)ri.disk.offset); + + if (op == RX_OP_LOOKUP) + result = rindex_lookup(task, &ri, &re, &re_ret, ca->header.cmd_flags); + else if (op == RX_OP_UPDATE) + result = rindex_update(task, &ri, &re, &re_ret, ca->header.cmd_flags); + else if (op == RX_OP_CREATE) + result = rindex_create(task, &ri, &re, &re_ret, ca->header.data, ca->header.data2); + else if (op == RX_OP_DELETE) + result = rindex_delete(task, &ri, &re, &re_ret); + else + result = -EINVAL; + + reply: + log_debug("%s %d,%d done %d", cmd, ca->ci_in, fd, result); + + memcpy(&h, &ca->header, sizeof(struct sm_header)); + h.version = SM_PROTO; + h.data = result; + h.data2 = 0; + h.length = sizeof(h) + sizeof(re_ret); + send(fd, &h, sizeof(h), MSG_NOSIGNAL); + send(fd, &re_ret, sizeof(re), MSG_NOSIGNAL); + + client_resume(ca->ci_in); +} + void call_cmd_thread(struct task *task, struct cmd_args *ca) { switch (ca->header.cmd) { @@ -2094,6 +2207,24 @@ void call_cmd_thread(struct task *task, struct cmd_args *ca) case SM_CMD_SET_EVENT: cmd_set_event(task, ca); break; + case SM_CMD_FORMAT_RINDEX: + cmd_format_rindex(task, ca); + break; + case SM_CMD_REBUILD_RINDEX: + cmd_rebuild_rindex(task, ca); + break; + case SM_CMD_UPDATE_RINDEX: + rindex_op(task, ca, "cmd_update_rindex", RX_OP_UPDATE); + break; + case SM_CMD_LOOKUP_RINDEX: + rindex_op(task, ca, 
"cmd_lookup_rindex", RX_OP_LOOKUP); + break; + case SM_CMD_CREATE_RESOURCE: + rindex_op(task, ca, "cmd_create_resource", RX_OP_CREATE); + break; + case SM_CMD_DELETE_RESOURCE: + rindex_op(task, ca, "cmd_delete_resource", RX_OP_DELETE); + break; }; } diff --git a/src/direct.c b/src/direct.c index 0633f75..dd8bff9 100644 --- a/src/direct.c +++ b/src/direct.c @@ -23,6 +23,7 @@ #include #include "sanlock_internal.h" +#include "sanlock_admin.h" #include "diskio.h" #include "ondisk.h" #include "log.h" @@ -31,6 +32,7 @@ #include "paxos_lease.h" #include "delta_lease.h" #include "timeouts.h" +#include "rindex.h" static int direct_read_leader_sector_size(struct task *task, struct sync_disk *sd) { @@ -496,6 +498,13 @@ int direct_dump(struct task *task, char *dump_path, int force_mode) { char *data, *bitmap; char *colon, *off_str; + uint32_t magic; + struct rindex_header *rh_end; + struct rindex_header *rh; + struct rindex_header rh_in; + struct rindex_entry *re_end; + struct rindex_entry *re; + struct rindex_entry re_in; struct leader_record *lr_end; struct leader_record *lr; struct leader_record lr_in; @@ -509,7 +518,7 @@ int direct_dump(struct task *task, char *dump_path, int force_mode) uint64_t dump_size = 0; uint64_t end_sector_nr; int sector_size, sector_count, datalen, align_size; - int i, rv, b; + int i, j, rv, b; memset(&sd, 0, sizeof(struct sync_disk)); @@ -570,12 +579,13 @@ int direct_dump(struct task *task, char *dump_path, int force_mode) rv = read_sectors(&sd, sector_size, sector_nr, sector_count, data, datalen, task, DEFAULT_IO_TIMEOUT, "dump"); - lr_end = (struct leader_record *)data; + magic_in(data, &magic); - leader_record_in(lr_end, &lr_in); - lr = &lr_in; + if (magic == DELTA_DISK_MAGIC) { + lr_end = (struct leader_record *)data; + leader_record_in(lr_end, &lr_in); + lr = &lr_in; - if (lr->magic == DELTA_DISK_MAGIC) { for (i = 0; i < sector_count; i++) { lr_end = (struct leader_record *)(data + (i * sector_size)); @@ -608,7 +618,11 @@ int 
direct_dump(struct task *task, char *dump_path, int force_mode) } printf("\n"); } - } else if (lr->magic == PAXOS_DISK_MAGIC) { + } else if (magic == PAXOS_DISK_MAGIC) { + lr_end = (struct leader_record *)data; + leader_record_in(lr_end, &lr_in); + lr = &lr_in; + strncpy(sname, lr->space_name, NAME_ID_SIZE); strncpy(rname, lr->resource_name, NAME_ID_SIZE); @@ -656,11 +670,47 @@ int direct_dump(struct task *task, char *dump_path, int force_mode) printf(" "); printf("%04u %04llu SH\n", i+1, (unsigned long long)mb.generation); } + } else if (magic == RINDEX_DISK_MAGIC) { + rh_end = (struct rindex_header *)data; + rindex_header_in(rh_end, &rh_in); + rh = &rh_in; + + strncpy(sname, rh->lockspace_name, NAME_ID_SIZE); + + printf("%08llu %36s rindex\n", + (unsigned long long)(sector_nr * sector_size), + sname); + + if (!force_mode) + goto next; + + /* i begins with 1 to skip the first sector of the rindex which holds the header */ + + for (i = 1; i < sector_count; i++) { + int entry_size = sizeof(struct rindex_entry); + int entries_per_sector = sector_size / entry_size; + + for (j = 0; j < entries_per_sector; j++) { + re_end = (struct rindex_entry *)(data + (i * sector_size) + (j * entry_size)); + rindex_entry_in(re_end, &re_in); + re = &re_in; + + if (!re->res_offset && !re->name[0]) + continue; + + printf("%08llu %36s rentry %s %llu\n", + (unsigned long long)((sector_nr * sector_size) + (i * sector_size) + (j * entry_size)), + sname, + re->name, (unsigned long long)re->res_offset); + } + } + + } else { if (end_sector_nr == 0) break; } - + next: sector_nr += sector_count; } @@ -725,7 +775,7 @@ int direct_next_free(struct task *task, char *path) leader_record_in(lr_end, &lr); - if (lr.magic != DELTA_DISK_MAGIC && lr.magic != PAXOS_DISK_MAGIC) { + if (lr.magic != DELTA_DISK_MAGIC && lr.magic != PAXOS_DISK_MAGIC && lr.magic != RINDEX_DISK_MAGIC) { printf("%llu\n", (unsigned long long)(sector_nr * sector_size)); rv = 0; goto out_free; @@ -740,3 +790,43 @@ int 
direct_next_free(struct task *task, char *path) return rv; } + +int direct_rindex_format(struct task *task, struct sanlk_rindex *ri) +{ + return rindex_format(task, ri); +} + +int direct_rindex_rebuild(struct task *task, struct sanlk_rindex *ri, + uint32_t cmd_flags) +{ + return rindex_rebuild(task, ri, cmd_flags | SANLK_RX_NO_LOCKSPACE); +} + +int direct_rindex_lookup(struct task *task, struct sanlk_rindex *ri, + struct sanlk_rentry *re, uint32_t cmd_flags) +{ + struct sanlk_rentry re_ret; + int rv; + + rv = rindex_lookup(task, ri, re, &re_ret, cmd_flags | SANLK_RX_NO_LOCKSPACE); + + if (!rv) + memcpy(re, &re_ret, sizeof(re_ret)); + + return rv; +} + +int direct_rindex_update(struct task *task, struct sanlk_rindex *ri, + struct sanlk_rentry *re, uint32_t cmd_flags) +{ + struct sanlk_rentry re_ret; + int rv; + + rv = rindex_update(task, ri, re, &re_ret, cmd_flags | SANLK_RX_NO_LOCKSPACE); + + if (!rv) + memcpy(re, &re_ret, sizeof(re_ret)); + + return rv; +} + diff --git a/src/direct.h b/src/direct.h index 29d61a1..fe7b779 100644 --- a/src/direct.h +++ b/src/direct.h @@ -55,4 +55,12 @@ int direct_dump(struct task *task, char *dump_path, int force_mode); int direct_next_free(struct task *task, char *path); +int direct_rindex_format(struct task *task, struct sanlk_rindex *ri); +int direct_rindex_rebuild(struct task *task, struct sanlk_rindex *ri, + uint32_t cmd_flags); +int direct_rindex_lookup(struct task *task, struct sanlk_rindex *ri, + struct sanlk_rentry *re, uint32_t cmd_flags); +int direct_rindex_update(struct task *task, struct sanlk_rindex *ri, + struct sanlk_rentry *re, uint32_t cmd_flags); + #endif diff --git a/src/direct_lib.c b/src/direct_lib.c index b4065cc..7010abe 100644 --- a/src/direct_lib.c +++ b/src/direct_lib.c @@ -33,6 +33,18 @@ void log_level(uint32_t space_id GNUC_UNUSED, uint32_t res_id GNUC_UNUSED, { } +int lockspace_begin_rindex_op(char *space_name GNUC_UNUSED, int rindex_op GNUC_UNUSED, struct space_info *spi GNUC_UNUSED); +int 
lockspace_begin_rindex_op(char *space_name GNUC_UNUSED, int rindex_op GNUC_UNUSED, struct space_info *spi GNUC_UNUSED) +{ + return -1; +} + +int lockspace_clear_rindex_op(char *space_name GNUC_UNUSED); +int lockspace_clear_rindex_op(char *space_name GNUC_UNUSED) +{ + return -1; +} + int lockspace_disk(char *space_name GNUC_UNUSED, struct sync_disk *disk GNUC_UNUSED); int lockspace_disk(char *space_name GNUC_UNUSED, struct sync_disk *disk GNUC_UNUSED) diff --git a/src/lockspace.c b/src/lockspace.c index a82c117..fa7ec80 100644 --- a/src/lockspace.c +++ b/src/lockspace.c @@ -91,6 +91,18 @@ static struct space *find_lockspace_id(uint32_t space_id) return NULL; } +static void _set_space_info(struct space *sp, struct space_info *spi) +{ + /* keep this in sync with any new fields added to struct space_info */ + spi->space_id = sp->space_id; + spi->io_timeout = sp->io_timeout; + spi->sector_size = sp->sector_size; + spi->align_size = sp->align_size; + spi->host_id = sp->host_id; + spi->host_generation = sp->host_generation; + spi->killing_pids = sp->killing_pids; +} + int _lockspace_info(const char *space_name, struct space_info *spi) { struct space *sp; @@ -99,17 +111,7 @@ int _lockspace_info(const char *space_name, struct space_info *spi) if (strncmp(sp->space_name, space_name, NAME_ID_SIZE)) continue; - /* keep this in sync with any new fields added to - struct space_info */ - - spi->space_id = sp->space_id; - spi->io_timeout = sp->io_timeout; - spi->sector_size = sp->sector_size; - spi->align_size = sp->align_size; - spi->host_id = sp->host_id; - spi->host_generation = sp->host_generation; - spi->killing_pids = sp->killing_pids; - + _set_space_info(sp, spi); return 0; } return -1; @@ -1109,6 +1111,13 @@ int rem_lockspace_start(struct sanlk_lockspace *ls, unsigned int *space_id) goto out; } + if (sp->rindex_op) { + log_space(sp, "rem_lockspace ignored for rindex_op %d", sp->rindex_op); + pthread_mutex_unlock(&spaces_mutex); + rv = -EBUSY; + goto out; + } + /* * Removal 
happens in a round about way: * - we set external_remove @@ -1429,6 +1438,54 @@ int lockspace_set_config(struct sanlk_lockspace *ls, GNUC_UNUSED uint32_t flags, return rv; } +int lockspace_begin_rindex_op(char *space_name, int rindex_op, struct space_info *spi) +{ + struct space *sp; + int rv = 0; + + pthread_mutex_lock(&spaces_mutex); + sp = _search_space(space_name, NULL, 0, &spaces, NULL, NULL, NULL); + if (!sp) { + rv = -ENOENT; + goto out; + } + + /* space_dead and thread_stop are only set while + spaces_mutex is held, so we don't need to lock sp->mutex */ + + if (sp->space_dead || sp->thread_stop) { + rv = -ENOSPC; + goto out; + } + + if (sp->rindex_op) { /* an rindex op is already recorded on this lockspace: refuse with -EBUSY */ + log_debug("begin_rindex_op busy with %d", sp->rindex_op); + rv = -EBUSY; + goto out; + } + + sp->rindex_op = rindex_op; + _set_space_info(sp, spi); +out: + pthread_mutex_unlock(&spaces_mutex); + return rv; +} + +int lockspace_clear_rindex_op(char *space_name) +{ + struct space *sp; + int rv = 0; + + pthread_mutex_lock(&spaces_mutex); + sp = _search_space(space_name, NULL, 0, &spaces, NULL, NULL, NULL); + if (!sp) + rv = -ENOENT; + else + sp->rindex_op = 0; + pthread_mutex_unlock(&spaces_mutex); + return rv; +} + static int _clean_event_fds(struct space *sp) { uint32_t end; @@ -1695,6 +1752,78 @@ static int stop_lockspace_thread(struct space *sp, int wait) return rv; } +/* + * locking/lifetime rules for a struct space + * + * multiple factors: + * . spaces_mutex + * . sp->mutex + * . the specific thread: main daemon thread, lockspace thread, worker thread + * + * spaces, spaces_add, spaces_rem lists are protected by spaces_mutex + * + * sp->mutex protects info that is exchanged between the lockspace thread + * (for the sp) and the main thread. This is primarily sp->thread_stop, + * and sp->lease_status (although it seems a couple other bits of info + * have been added over time that are communicated between the lockspace + * thread and the main thread). 
+ * + * add_lockspace_start(), called by worker thread, creates sp, + * adds it to spaces_add list under spaces_mutex, creates lockspace_thread + * for the sp. + * + * lockspace_thread never has to worry about sp going away and can access + * sp directly any time. The sp will not be freed until lockspace_thread + * has exited. + * + * The main thread never has to worry about sp going away, because the + * main thread is the only context in which sp structs are freed + * (and that only happens in free_lockspaces). + * + * add_lockspace_wait(), called by worker thread, can access sp directly + * because sp won't go away while it's on spaces_add. Only add_lockspace_wait + * can do something with sp while it's on spaces_add. _wait uses sp->mutex + * to exchange lease status with lockspace_thread. Once the host_id lease + * is acquired, _wait moves sp from spaces_add to spaces under spaces_mutex. + * After sp is moved to spaces list, its lifetime is owned by the main thread. + * + * While sp is on spaces list, its lifetime is controlled by the main thread. + * Apart from lockspace_thread, any other thread, e.g. worker thread, must + * lock spaces_mutex, look up sp on spaces list, access sp fields, then unlock + * spaces_mutex. After releasing spaces_mutex, it can't access sp struct + * because the main thread could dispose of it. If the worker thread wants + * to look at info that's being updated by the lockspace_thread, it should + * also take sp->mutex before copying it. + * + * I currently see some violations of proper sp access that should be fixed. + * The bad pattern in each case is: lock spaces_mutex, find sp, + * unlock spaces_mutex, lock sp->mutex. The sp could in theory go away between + * unlock spaces_mutex and lock sp->mutex. (In practice this would + * likely never happen.) + * + * . worker_thread lockspace_set_event() + * (reg_event and end_event are ok since they are called from + * the main thread) + * + * . worker_thread host_status_set_bit() + * + * . 
resource_thread send_event_callbacks() does the same. + * + * I'm not sure what the best solution would be: lock sp->mutex before + * unlocking spaces_mutex? Do everything under spaces_mutex? Add + * a simple ref count to sp for these cases of using sp from other + * threads? + * + * cmd_rem_lockspace() is run by a worker_thread. rem_lockspace_start() + * locks spaces_mutex, finds sp, sets sp->external_remove, unlocks spaces_mutex. + * Then the main thread, which owns the sp structs, sees sp->external_remove, + * kills any pids using the sp, and when the sp is no longer used, it sets + * sp->thread_stop, and moves sp from spaces list to spaces_rem list. + * The main thread then runs free_lockspaces() which stops the lockspace_thread + * for sp's on spaces_rem. When the lockspace_thread exits, the main thread + * then removes sp from spaces_rem and frees sp. + */ + void free_lockspaces(int wait) { struct space *sp, *safe; diff --git a/src/lockspace.h b/src/lockspace.h index f833efe..0c3cba3 100644 --- a/src/lockspace.h +++ b/src/lockspace.h @@ -80,4 +80,7 @@ int send_event_callbacks(uint32_t space_id, uint64_t from_host_id, uint64_t from /* locks spaces_mutex, locks sp */ int lockspace_set_config(struct sanlk_lockspace *ls, uint32_t flags, uint32_t cmd); +int lockspace_begin_rindex_op(char *space_name, int rindex_op, struct space_info *spi); +int lockspace_clear_rindex_op(char *space_name); + #endif diff --git a/src/main.c b/src/main.c index 9340ef7..f60b4d3 100644 --- a/src/main.c +++ b/src/main.c @@ -1245,6 +1245,12 @@ static void process_connection(int ci) case SM_CMD_GET_LVB: case SM_CMD_SHUTDOWN_WAIT: case SM_CMD_SET_EVENT: + case SM_CMD_FORMAT_RINDEX: + case SM_CMD_REBUILD_RINDEX: + case SM_CMD_UPDATE_RINDEX: + case SM_CMD_LOOKUP_RINDEX: + case SM_CMD_CREATE_RESOURCE: + case SM_CMD_DELETE_RESOURCE: rv = client_suspend(ci); if (rv < 0) return; @@ -1753,6 +1759,97 @@ static int group_to_gid(char *arg) return gr->gr_gid; } +static int parse_arg_rentry(char *str) 
+{ + char *name = NULL; + char *offset = NULL; + + if (!str) + return -EINVAL; + + /* "-r :1M" can be used to specify only an offset */ + if (str[0] != ':') + name = str; + + if ((offset = strchr(str, ':'))) { + uint64_t offnum; + char *m ; + + *offset = '\0'; + offset++; + + if ((m = strchr(offset, 'M'))) { + *m = '\0'; + offnum = atoll(offset) * 1024 * 1024; + } else { + offnum = atoll(offset); + } + com.rentry.offset = offnum; + } + + if (name) + strncpy(com.rentry.name, name, SANLK_NAME_LEN); + + return 0; +} + +static int parse_arg_rindex(char *str) +{ + char *ls_name = NULL; + char *path = NULL; + char *offset = NULL; + int i; + + if (!str) + return -EINVAL; + + ls_name = &str[0]; + + for (i = 0; i < strlen(str); i++) { + if (str[i] == '\\') { + i++; + continue; + } + + if (str[i] == ':') { + if (!path) + path = &str[i]; + else if (!offset) + offset = &str[i]; + } + } + + if (path) { + *path = '\0'; + path++; + } + if (offset) { + *offset= '\0'; + offset++; + } + + if (ls_name) + strncpy(com.rindex.lockspace_name, ls_name, SANLK_NAME_LEN); + + if (path) + sanlock_path_import(com.rindex.disk.path, path, sizeof(com.rindex.disk.path)); + + if (offset) { + uint64_t offnum; + char *m ; + + if ((m = strchr(offset, 'M'))) { + *m = '\0'; + offnum = atoll(offset) * 1024 * 1024; + } else { + offnum = atoll(offset); + } + com.rindex.disk.offset = offnum; + } + + return 0; +} + static int parse_arg_lockspace(char *arg) { sanlock_str_to_lockspace(arg, &com.lockspace); @@ -1860,11 +1957,21 @@ static void print_usage(void) printf("sanlock client inquire -p \n"); printf("sanlock client request -r RESOURCE -f \n"); printf("sanlock client examine -r RESOURCE | -s LOCKSPACE\n"); + printf("sanlock client format -x RINDEX\n"); + printf("sanlock client create -x RINDEX -e \n"); + printf("sanlock client delete -x RINDEX -e [:]\n"); + printf("sanlock client lookup -x RINDEX [-e :]\n"); + printf("sanlock client update -x RINDEX -e [:] [-z 0|1]\n"); + printf("sanlock client rebuild -x 
RINDEX\n"); printf("\n"); printf("sanlock direct [-a 0|1] [-o 0|1] [-Z 512|4096]\n"); printf("sanlock direct init -s LOCKSPACE | -r RESOURCE\n"); printf("sanlock direct read_leader -s LOCKSPACE | -r RESOURCE\n"); printf("sanlock direct dump [:[:]]\n"); + printf("sanlock direct format -x RINDEX\n"); + printf("sanlock direct lookup -x RINDEX [-e :]\n"); + printf("sanlock direct update -x RINDEX -e [:] [-z 0|1]\n"); + printf("sanlock direct rebuild -x RINDEX\n"); printf("\n"); printf("LOCKSPACE = :::\n"); printf(" name of lockspace\n"); @@ -1879,6 +1986,11 @@ static void print_usage(void) printf(" offset on path (bytes)\n"); printf(" optional leader version or SH for shared lease\n"); printf("\n"); + printf("RINDEX = ::\n"); + printf(" name of lockspace\n"); + printf(" path to storage reserved for leases\n"); + printf(" offset on path (bytes)\n"); + printf("\n"); printf("Limits:\n"); printf("offset alignment with 512 byte sectors: %d (1MB)\n", 1024 * 1024); printf("offset alignment with 4096 byte sectors: %d (8MB)\n", 1024 * 1024 * 8); @@ -1993,7 +2105,25 @@ static int read_command_line(int argc, char *argv[]) com.action = ACT_SET_EVENT; else if (!strcmp(act, "set_config")) com.action = ACT_SET_CONFIG; - else { + else if (!strcmp(act, "format")) { + com.action = ACT_FORMAT; + com.rindex_op = RX_OP_FORMAT; + } else if (!strcmp(act, "rebuild")) { + com.action = ACT_REBUILD; + com.rindex_op = RX_OP_REBUILD; + } else if (!strcmp(act, "create")) { + com.action = ACT_CREATE; + com.rindex_op = RX_OP_CREATE; + } else if (!strcmp(act, "delete")) { + com.action = ACT_DELETE; + com.rindex_op = RX_OP_DELETE; + } else if (!strcmp(act, "lookup")) { + com.action = ACT_LOOKUP; + com.rindex_op = RX_OP_LOOKUP; + } else if (!strcmp(act, "update")) { + com.action = ACT_UPDATE; + com.rindex_op = RX_OP_UPDATE; + } else { log_tool("client action \"%s\" is unknown", act); exit(EXIT_FAILURE); } @@ -2020,7 +2150,19 @@ static int read_command_line(int argc, char *argv[]) com.action = 
ACT_RELEASE_ID; else if (!strcmp(act, "renew_id")) com.action = ACT_RENEW_ID; - else { + else if (!strcmp(act, "format")) { + com.action = ACT_FORMAT; + com.rindex_op = RX_OP_FORMAT; + } else if (!strcmp(act, "rebuild")) { + com.action = ACT_REBUILD; + com.rindex_op = RX_OP_REBUILD; + } else if (!strcmp(act, "lookup")) { + com.action = ACT_LOOKUP; + com.rindex_op = RX_OP_LOOKUP; + } else if (!strcmp(act, "update")) { + com.action = ACT_UPDATE; + com.rindex_op = RX_OP_UPDATE; + } else { log_tool("direct action \"%s\" is unknown", act); exit(EXIT_FAILURE); } @@ -2130,8 +2272,12 @@ static int read_command_line(int argc, char *argv[]) com.he_data = strtoull(optionarg, NULL, 0); break; case 'e': - strncpy(com.our_host_name, optionarg, NAME_ID_SIZE); - com.he_event = strtoull(optionarg, NULL, 0); + if (com.rindex_op) { + parse_arg_rentry(optionarg); + } else { + strncpy(com.our_host_name, optionarg, NAME_ID_SIZE); + com.he_event = strtoull(optionarg, NULL, 0); + } break; case 'i': com.host_id = strtoull(optionarg, NULL, 0); @@ -2173,6 +2319,9 @@ static int read_command_line(int argc, char *argv[]) com.used_set = 1; com.used = atoi(optionarg); break; + case 'x': + parse_arg_rindex(optionarg); + break; case 'z': com.clear_arg = 1; break; @@ -2862,6 +3011,49 @@ static int do_client(void) log_tool("set_config done %d", rv); break; + case ACT_FORMAT: + if (com.sector_size == 512) + com.rindex.flags |= SANLK_RIF_ALIGN1M; + else if (com.sector_size == 4096) + com.rindex.flags |= SANLK_RIF_ALIGN8M; + + rv = sanlock_format_rindex(&com.rindex, 0); + log_tool("format done %d", rv); + break; + + case ACT_REBUILD: + rv = sanlock_rebuild_rindex(&com.rindex, 0); + log_tool("rebuild done %d", rv); + break; + + case ACT_CREATE: + rv = sanlock_create_resource(&com.rindex, 0, &com.rentry, 0, 0); + log_tool("create_resource done %d", rv); + if (!rv) + log_tool("offset %llu", (unsigned long long)com.rentry.offset); + break; + + case ACT_DELETE: + rv = sanlock_delete_resource(&com.rindex, 0, 
&com.rentry); + log_tool("delete_resource done %d", rv); + break; + + case ACT_LOOKUP: + rv = sanlock_lookup_rindex(&com.rindex, 0, &com.rentry); + log_tool("lookup done %d", rv); + if (!rv) + log_tool("name %s offset %llu", + com.rentry.name[0] ? com.rentry.name : "-", + (unsigned long long)com.rentry.offset); + break; + + case ACT_UPDATE: + rv = sanlock_update_rindex(&com.rindex, + com.clear_arg ? SANLK_RXUP_REM : SANLK_RXUP_ADD, + &com.rentry); + log_tool("update done %d", rv); + break; + default: log_tool("action not implemented"); rv = -1; @@ -3122,6 +3314,7 @@ static int do_direct_init(void) static int do_direct(void) { struct leader_record leader; + uint32_t cmd_flags = 0; int rv; /* we want a record of any out-of-band changes to disk */ @@ -3152,6 +3345,34 @@ static int do_direct(void) rv = do_direct_write_leader(); break; + case ACT_FORMAT: + rv = direct_rindex_format(&main_task, &com.rindex); + log_tool("format done %d", rv); + break; + + case ACT_REBUILD: + rv = direct_rindex_rebuild(&main_task, &com.rindex, 0); + log_tool("rebuild done %d", rv); + break; + + case ACT_LOOKUP: + rv = direct_rindex_lookup(&main_task, &com.rindex, &com.rentry, 0); + log_tool("lookup done %d", rv); + if (!rv) + log_tool("name %s offset %llu", + com.rentry.name[0] ? com.rentry.name : "-", + (unsigned long long)com.rentry.offset); + break; + + case ACT_UPDATE: + if (com.clear_arg) + cmd_flags |= SANLK_RXUP_REM; + else + cmd_flags |= SANLK_RXUP_ADD; + rv = direct_rindex_update(&main_task, &com.rindex, &com.rentry, cmd_flags); + log_tool("update done %d", rv); + break; + case ACT_ACQUIRE: syslog(LOG_WARNING, "acquire"); rv = direct_acquire(&main_task, com.io_timeout_arg, diff --git a/src/ondisk.c b/src/ondisk.c index 21f9b64..c703521 100644 --- a/src/ondisk.c +++ b/src/ondisk.c @@ -20,6 +20,15 @@ * "end" variables point to ondisk format (endian converted) structures. 
*/ +void magic_in(char *end, uint32_t *magic) +{ + uint32_t magic_end; + + memcpy(&magic_end, end, sizeof(uint32_t)); + + *magic = le32_to_cpu(magic_end); +} + void leader_record_in(struct leader_record *end, struct leader_record *lr) { lr->magic = le32_to_cpu(end->magic); @@ -120,3 +129,43 @@ void mode_block_out(struct mode_block *mb, struct mode_block *end) end->generation = cpu_to_le64(mb->generation); } +void rindex_header_in(struct rindex_header *end, struct rindex_header *rh) +{ + rh->magic = le32_to_cpu(end->magic); + rh->version = le32_to_cpu(end->version); + rh->flags = le32_to_cpu(end->flags); + rh->sector_size = le32_to_cpu(end->sector_size); + rh->max_resources = le32_to_cpu(end->max_resources); + rh->unused = le32_to_cpu(end->unused); + rh->rx_offset = le64_to_cpu(end->rx_offset); + memcpy(rh->lockspace_name, end->lockspace_name, NAME_ID_SIZE); +} + +void rindex_header_out(struct rindex_header *rh, struct rindex_header *end) +{ + end->magic = cpu_to_le32(rh->magic); + end->version = cpu_to_le32(rh->version); + end->flags = cpu_to_le32(rh->flags); + end->sector_size = cpu_to_le32(rh->sector_size); + end->max_resources = cpu_to_le32(rh->max_resources); + end->unused = cpu_to_le32(rh->unused); + end->rx_offset = cpu_to_le64(rh->rx_offset); + memcpy(end->lockspace_name, rh->lockspace_name, NAME_ID_SIZE); +} + +void rindex_entry_in(struct rindex_entry *end, struct rindex_entry *re) +{ + re->res_offset = le64_to_cpu(end->res_offset); + re->flags = le32_to_cpu(end->flags); + re->unused = le32_to_cpu(end->unused); + memcpy(re->name, end->name, NAME_ID_SIZE); +} + +void rindex_entry_out(struct rindex_entry *re, struct rindex_entry *end) +{ + end->res_offset = cpu_to_le64(re->res_offset); + end->flags = cpu_to_le32(re->flags); + end->unused = cpu_to_le32(re->unused); + memcpy(end->name, re->name, NAME_ID_SIZE); +} + diff --git a/src/ondisk.h b/src/ondisk.h index 3ae4833..9f7768b 100644 --- a/src/ondisk.h +++ b/src/ondisk.h @@ -35,6 +35,7 @@ #define 
cpu_to_le64(x) (x) #endif +void magic_in(char *end, uint32_t *magic); void leader_record_in(struct leader_record *end, struct leader_record *lr); void leader_record_out(struct leader_record *lr, struct leader_record *end); void request_record_in(struct request_record *end, struct request_record *rr); @@ -43,5 +44,9 @@ void paxos_dblock_in(struct paxos_dblock *end, struct paxos_dblock *pd); void paxos_dblock_out(struct paxos_dblock *pd, struct paxos_dblock *end); void mode_block_in(struct mode_block *end, struct mode_block *mb); void mode_block_out(struct mode_block *mb, struct mode_block *end); +void rindex_header_in(struct rindex_header *end, struct rindex_header *rh); +void rindex_header_out(struct rindex_header *rh, struct rindex_header *end); +void rindex_entry_in(struct rindex_entry *end, struct rindex_entry *re); +void rindex_entry_out(struct rindex_entry *re, struct rindex_entry *end); #endif diff --git a/src/paxos_lease.c b/src/paxos_lease.c index 2303873..ed16f41 100644 --- a/src/paxos_lease.c +++ b/src/paxos_lease.c @@ -1039,6 +1039,7 @@ int paxos_read_resource(struct task *task, { struct leader_record leader; uint32_t checksum; + int tmp_sector_size = 0; int rv; memset(&leader, 0, sizeof(struct leader_record)); @@ -1052,32 +1053,46 @@ int paxos_read_resource(struct task *task, if (!token->sector_size) { token->sector_size = 4096; token->align_size = sector_size_to_align_size(4096); + tmp_sector_size = 1; } rv = read_leader(task, token, &token->disks[0], &leader, &checksum); if (rv < 0) return rv; - token->sector_size = leader.sector_size; - token->align_size = sector_size_to_align_size(leader.sector_size); - if (!res->lockspace_name[0]) memcpy(token->r.lockspace_name, leader.space_name, NAME_ID_SIZE); if (!res->name[0]) memcpy(token->r.name, leader.resource_name, NAME_ID_SIZE); - rv = verify_leader(token, &token->disks[0], &leader, checksum, "read_resource"); + if (token->flags & T_CHECK_EXISTS) { + if (leader.magic != PAXOS_DISK_MAGIC) + rv = 
SANLK_LEADER_MAGIC; + else + rv = SANLK_OK; + } else { + rv = verify_leader(token, &token->disks[0], &leader, checksum, "read_resource"); + } if (rv == SANLK_OK) { memcpy(res->lockspace_name, leader.space_name, NAME_ID_SIZE); memcpy(res->name, leader.resource_name, NAME_ID_SIZE); res->lver = leader.lver; - if (leader.sector_size == 512) - res->flags |= SANLK_RES_ALIGN1M; - else if (leader.sector_size == 4096) - res->flags |= SANLK_RES_ALIGN8M; + if ((leader.sector_size == 512) || (leader.sector_size == 4096)) { + token->sector_size = leader.sector_size; + token->align_size = sector_size_to_align_size(leader.sector_size); + + if (leader.sector_size == 512) + res->flags |= SANLK_RES_ALIGN1M; + else if (leader.sector_size == 4096) + res->flags |= SANLK_RES_ALIGN8M; + } else if (tmp_sector_size) { + /* we don't know the correct value, so don't set any */ + token->sector_size = 0; + token->align_size = 0; + } } return rv; diff --git a/src/resource.c b/src/resource.c index 5d27b0e..d0da60b 100644 --- a/src/resource.c +++ b/src/resource.c @@ -53,7 +53,7 @@ static pthread_mutex_t resource_mutex; static pthread_cond_t resource_cond; static struct list_head host_events; static int resources_free_count; -static uint32_t resource_id_counter = 1; +static uint32_t resource_id_counter = 2; /* id 1 used for internal rindex lease */ #define FREE_RES_COUNT 128 diff --git a/src/rindex.c b/src/rindex.c new file mode 100644 index 0000000..601c8bf --- /dev/null +++ b/src/rindex.c @@ -0,0 +1,1085 @@ +/* + * Copyright 2018 Red Hat, Inc. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License v2 or (at your option) any later version. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "sanlock_internal.h" +#include "sanlock_admin.h" +#include "diskio.h" +#include "ondisk.h" +#include "log.h" +#include "paxos_lease.h" +#include "lockspace.h" +#include "resource.h" +#include "task.h" +#include "timeouts.h" +#include "rindex_disk.h" +#include "rindex.h" +#include "paxos_dblock.h" +#include "leader.h" + +struct rindex_info { + struct sanlk_rindex *ri; /* point to sanlk_rindex */ + struct sync_disk *disk; /* points to sanlk_rindex.disk */ + struct rindex_header header; +}; + +/* this token is used for paxos_lease_acquire/release */ + +static struct token *setup_rindex_token(struct rindex_info *rx, + int sector_size, + struct space_info *spi) +{ + struct token *token; + int token_len; + int align_size = sector_size_to_align_size(sector_size); + + token_len = sizeof(struct token) + sizeof(struct sync_disk); + token = malloc(token_len); + if (!token) + return NULL; + + memset(token, 0, token_len); + memcpy(token->r.lockspace_name, rx->ri->lockspace_name, SANLK_NAME_LEN); + strcpy(token->r.name, "rindex_lease"); + token->sector_size = sector_size; + token->align_size = align_size; + token->io_timeout = spi ? 
spi->io_timeout : DEFAULT_IO_TIMEOUT; + token->r.num_disks = 1; + + token->disks = (struct sync_disk *)&token->r.disks[0]; /* shorthand */ + memcpy(token->disks[0].path, rx->disk->path, SANLK_PATH_LEN); + token->disks[0].offset = rx->disk->offset + align_size; + token->disks[0].fd = rx->disk->fd; + + if (spi) { + token->host_id = spi->host_id; + token->host_generation = spi->host_generation; + token->space_id = spi->space_id; + token->res_id = 1; + } + + return token; +} + +/* this token is only used for paxos_lease_init */ + +static struct token *setup_resource_token(struct rindex_info *rx, + char *res_name, + int sector_size, + struct space_info *spi) +{ + struct token *token; + int token_len; + int align_size = sector_size_to_align_size(sector_size); + + token_len = sizeof(struct token) + sizeof(struct sync_disk); + token = malloc(token_len); + if (!token) + return NULL; + + memset(token, 0, token_len); + memcpy(token->r.lockspace_name, rx->ri->lockspace_name, SANLK_NAME_LEN); + memcpy(token->r.name, res_name, SANLK_NAME_LEN); + token->sector_size = sector_size; + token->align_size = align_size; + token->io_timeout = spi ? 
spi->io_timeout : DEFAULT_IO_TIMEOUT; + token->r.num_disks = 1; + + token->disks = (struct sync_disk *)&token->r.disks[0]; /* shorthand */ + memcpy(token->disks[0].path, rx->disk->path, SANLK_PATH_LEN); + token->disks[0].fd = rx->disk->fd; + /* there is no offset yet, it is found and set later */ + + return token; +} + +static uint32_t sector_size_to_max_resources(int sector_size) +{ + if (sector_size == 512) + return MAX_RINDEX_ENTRIES_1M; + + if (sector_size == 4096) + return MAX_RINDEX_ENTRIES_8M; + + return 0; +} + +static int search_entries(struct rindex_info *rx, char *rindex_iobuf, + uint64_t *ent_offset, uint64_t *res_offset, + int find_free, char *find_name) +{ + struct rindex_entry re; + struct rindex_entry *re_end; + uint64_t entry_offset_in_rindex; + uint32_t max_resources = rx->header.max_resources; + int sector_size = rx->header.sector_size; + int align_size = sector_size_to_align_size(sector_size); + int i; + + if (!max_resources) + max_resources = sector_size_to_max_resources(sector_size); + + for (i = 0; i < max_resources; i++) { + /* skip first sector which holds header */ + entry_offset_in_rindex = sector_size + (i * sizeof(struct rindex_entry)); + + re_end = (struct rindex_entry *)(rindex_iobuf + entry_offset_in_rindex); + + rindex_entry_in(re_end, &re); + + if (find_free && (!re.res_offset && !re.name[0])) { + *ent_offset = entry_offset_in_rindex; + *res_offset = rx->disk->offset + (2 * align_size) + (i * align_size); + return 0; + } + + if (find_name && re.name[0] && !strncmp(re.name, find_name, SANLK_NAME_LEN)) { + *ent_offset = entry_offset_in_rindex; + *res_offset = rx->disk->offset + (2 * align_size) + (i * align_size); + return 0; + } + } + + return -ENOENT; +} + +static int update_rindex(struct task *task, + struct space_info *spi, + struct rindex_info *rx, + char *rindex_iobuf, + struct sanlk_rentry *re, + uint64_t ent_offset, + uint64_t res_offset, + int delete) +{ + struct rindex_entry re_new; + struct rindex_entry re_end; + char 
*sector_iobuf; + char **p_iobuf; + uint32_t sector_offset; + uint32_t entry_offset_in_sector; + int sector_size = rx->header.sector_size; + int iobuf_len; + int rv; + + /* + * ent_offset is the offset (in bytes) from the start of the rindex to + * the entry being updated. (This includes the size of the header + * sector; no offsets are calculated from the end of the header + * sector.) + * + * sector_offset is the offset (in bytes) from the start of the rindex + * to the sector containing ent_offset. The entire sector is written. + * + * entry_offset_in_sector is the offset (in bytes) from the start of + * the target sector to the entry being updated. + */ + + sector_offset = (ent_offset / sector_size) * sector_size; + entry_offset_in_sector = ent_offset % sector_size; + + iobuf_len = sector_size; + + p_iobuf = &sector_iobuf; + + rv = posix_memalign((void *)p_iobuf, getpagesize(), iobuf_len); + if (rv) + return rv; + + memset(sector_iobuf, 0, iobuf_len); + + memset(&re_new, 0, sizeof(struct rindex_entry)); + + if (!delete) { + memcpy(re_new.name, re->name, NAME_ID_SIZE); + re_new.res_offset = res_offset; + } + + rindex_entry_out(&re_new, &re_end); + + /* initialize new sector with existing index content */ + memcpy(sector_iobuf, rindex_iobuf + sector_offset, sector_size); + + /* replace the specific entry */ + memcpy(sector_iobuf + entry_offset_in_sector, &re_end, sizeof(struct rindex_entry)); + + rv = write_iobuf(rx->disk->fd, rx->disk->offset + sector_offset, sector_iobuf, iobuf_len, task, spi->io_timeout, NULL); + + if (rv != SANLK_AIO_TIMEOUT) + free(sector_iobuf); + + return rv; +} + +static int read_rindex(struct task *task, + struct space_info *spi, + struct rindex_info *rx, + char **rindex_iobuf_ret) +{ + char *iobuf; + char **p_iobuf; + int sector_size = rx->header.sector_size; + int align_size = sector_size_to_align_size(sector_size); + int iobuf_len; + int rv; + + iobuf_len = align_size; + + p_iobuf = &iobuf; + + rv = posix_memalign((void *)p_iobuf,
getpagesize(), iobuf_len); + if (rv) { + return rv; + } + + memset(iobuf, 0, iobuf_len); + + rv = read_iobuf(rx->disk->fd, rx->disk->offset, iobuf, iobuf_len, task, spi->io_timeout, NULL); + if (rv < 0) { + free(iobuf); + return rv; + } + + *rindex_iobuf_ret = iobuf; + return rv; +} + +static int read_rindex_header(struct task *task, + struct space_info *spi, + struct rindex_info *rx) +{ + struct rindex_header *rh_end; + char *iobuf; + char **p_iobuf; + int sector_size = spi->sector_size; + int io_timeout = spi->io_timeout; + int iobuf_len; + int rv; + + if (!sector_size) + sector_size = 4096; + if (!io_timeout) { + io_timeout = DEFAULT_IO_TIMEOUT; + spi->io_timeout = io_timeout; + } + + /* + * lockspace sector_size will usually be the same as rindex sector_size. + * use the lockspace sector size for reading the rindex header which + * officially gives us the rindex sector_size. + */ + + iobuf_len = sector_size; + + p_iobuf = &iobuf; + + rv = posix_memalign((void *)p_iobuf, getpagesize(), iobuf_len); + if (rv) + return -ENOMEM; + + rv = read_iobuf(rx->disk->fd, rx->disk->offset, iobuf, iobuf_len, task, io_timeout, NULL); + if (rv < 0) + goto out; + + rh_end = (struct rindex_header *)iobuf; + + rindex_header_in(rh_end, &rx->header); + + if (rx->header.magic != RINDEX_DISK_MAGIC) { + log_debug("rindex header bad magic %x vs %x on %s:%llu", + rx->header.magic, + RINDEX_DISK_MAGIC, + rx->disk->path, + (unsigned long long)rx->disk->offset); + rv = SANLK_RINDEX_MAGIC; + goto out; + } + + if ((rx->header.version & 0xFFFF0000) != RINDEX_DISK_VERSION_MAJOR) { + log_debug("rindex header bad version %x vs %x on %s:%llu", + rx->header.version, + RINDEX_DISK_VERSION_MAJOR, + rx->disk->path, + (unsigned long long)rx->disk->offset); + rv = SANLK_RINDEX_VERSION; + goto out; + } + + if (strcmp(rx->header.lockspace_name, rx->ri->lockspace_name)) { + log_debug("rindex header bad lockspace_name %s vs %s on %s:%llu", + rx->header.lockspace_name, + rx->ri->lockspace_name, + 
rx->disk->path, + (unsigned long long)rx->disk->offset); + rv = SANLK_RINDEX_LOCKSPACE; + goto out; + } + + if (rx->header.rx_offset != rx->disk->offset) { + log_debug("rindex header bad offset %llu on %s:%llu", + (unsigned long long)rx->header.rx_offset, + rx->disk->path, + (unsigned long long)rx->disk->offset); + rv = SANLK_RINDEX_OFFSET; + goto out; + } +out: + if (rv != SANLK_AIO_TIMEOUT) + free(iobuf); + + return rv; +} + +/* + * format rindex: write new rindex header, and initialize internal paxos lease + * for protecting the rindex. + */ + +int rindex_format(struct task *task, struct sanlk_rindex *ri) +{ + struct rindex_info rx; + struct rindex_header rh; + struct rindex_header rh_end; + struct token *token; + char *iobuf; + char **p_iobuf; + int sector_size, align_size; + int iobuf_len; + int rv; + + memset(&rx, 0, sizeof(rx)); + rx.ri = ri; + rx.disk = (struct sync_disk *)&ri->disk; + + rv = open_disk(rx.disk); + if (rv < 0) { + log_error("rindex_format open failed %d %s", rv, rx.disk->path); + return rv; + } + + if (ri->flags & SANLK_RIF_ALIGN1M) + sector_size = 512; + else if (ri->flags & SANLK_RIF_ALIGN8M) + sector_size = 4096; + else + sector_size = rx.disk->sector_size; + + align_size = sector_size_to_align_size(sector_size); + + iobuf_len = align_size; + + p_iobuf = &iobuf; + + rv = posix_memalign((void *)p_iobuf, getpagesize(), iobuf_len); + if (rv) + goto out_close; + + memset(iobuf, 0, iobuf_len); + + memset(&rh, 0, sizeof(struct rindex_header)); + rh.magic = RINDEX_DISK_MAGIC; + rh.version = RINDEX_DISK_VERSION_MAJOR | RINDEX_DISK_VERSION_MINOR; + rh.sector_size = sector_size; + rh.max_resources = rx.ri->max_resources; + rh.rx_offset = rx.disk->offset; + strncpy(rh.lockspace_name, rx.ri->lockspace_name, NAME_ID_SIZE); + + memset(&rh_end, 0, sizeof(struct rindex_header)); + rindex_header_out(&rh, &rh_end); + + memcpy(iobuf, &rh_end, sizeof(struct rindex_header)); + + rv = write_iobuf(rx.disk->fd, rx.disk->offset, iobuf, iobuf_len, task, 
DEFAULT_IO_TIMEOUT, NULL); + if (rv < 0) { + log_error("rindex_format write failed %d %s", rv, rx.disk->path); + goto out_iobuf; + } + + token = setup_rindex_token(&rx, sector_size, NULL); + if (!token) { + rv = -ENOMEM; + goto out_iobuf; + } + + rv = paxos_lease_init(task, token, 0, 0, 0); + if (rv < 0) { + log_error("rindex_format lease init failed %d", rv); + goto out_token; + } + + rv = 0; + + out_token: + free(token); + out_iobuf: + if (rv != SANLK_AIO_TIMEOUT) + free(iobuf); + out_close: + close_disks(rx.disk, 1); + return rv; +} + +int rindex_create(struct task *task, struct sanlk_rindex *ri, + struct sanlk_rentry *re, struct sanlk_rentry *re_ret, + uint32_t max_hosts, uint32_t num_hosts) +{ + struct rindex_info rx; + struct space_info spi; + struct leader_record leader; + struct paxos_dblock dblock; + struct token *rx_token; + struct token *res_token; + char *rindex_iobuf = NULL; + uint64_t ent_offset, res_offset; + int sector_size; + int rv; + + memset(&rx, 0, sizeof(rx)); + rx.ri = ri; + rx.disk = (struct sync_disk *)&ri->disk; + + rv = open_disk(rx.disk); + if (rv < 0) { + log_error("rindex_create open failed %d %s", rv, rx.disk->path); + return rv; + } + + /* + * Allows only one rindex op for a given lockspace at a time. + * If there's already one in progress, this returns EBUSY. + * Also collects lockspace info at the same time. 
+ */ + memset(&spi, 0, sizeof(spi)); + + rv = lockspace_begin_rindex_op(ri->lockspace_name, RX_OP_CREATE, &spi); + if (rv < 0) { + log_error("rindex_create lockspace not available %d %s", rv, ri->lockspace_name); + goto out_close; + } + + rv = read_rindex_header(task, &spi, &rx); + if (rv < 0) { + log_error("rindex_create failed to read rindex header %d on %s:%llu", + rv, rx.disk->path, (unsigned long long)rx.disk->offset); + goto out_clear; + } + + sector_size = rx.header.sector_size; + + /* used to acquire the internal paxos lease protecting the rindex */ + rx_token = setup_rindex_token(&rx, sector_size, &spi); + if (!rx_token) { + rv = -ENOMEM; + goto out_clear; + } + + /* used to initialize the new paxos lease for the resource */ + res_token = setup_resource_token(&rx, re->name, sector_size, &spi); + if (!res_token) { + free(rx_token); + rv = -ENOMEM; + goto out_clear; + } + + rv = paxos_lease_acquire(task, rx_token, + PAXOS_ACQUIRE_OWNER_NOWAIT | PAXOS_ACQUIRE_QUIET_FAIL, + &leader, &dblock, 0, 0); + if (rv < 0) { + /* TODO: sleep and retry if this fails because it's held by another host? 
*/ + log_error("rindex_create failed to acquire rindex lease %d", rv); + goto out_token; + } + + rv = read_rindex(task, &spi, &rx, &rindex_iobuf); + if (rv < 0) { + log_error("rindex_create failed to read rindex %d", rv); + goto out_lease; + } + + rv = search_entries(&rx, rindex_iobuf, &ent_offset, &res_offset, 1, NULL); + if (rv < 0) { + log_error("rindex_create failed to find free offset %d", rv); + goto out_iobuf; + } + + /* set the location of the new paxos lease */ + + log_debug("rindex_create found offset %llu for %s:%s", + (unsigned long long)res_offset, + rx.ri->lockspace_name, re->name); + + res_token->disks[0].offset = res_offset; + + /* write the new paxos lease */ + + rv = paxos_lease_init(task, res_token, num_hosts, max_hosts, 0); + if (rv < 0) { + log_error("rindex_create failed to init new lease %d", rv); + goto out_iobuf; + } + + rv = update_rindex(task, &spi, &rx, rindex_iobuf, re, ent_offset, res_offset, 0); + if (rv < 0) { + log_error("rindex_create failed to update rindex %d", rv); + goto out_iobuf; + } + + log_debug("rindex_create updated rindex entry %llu for %s %llu", + (unsigned long long)ent_offset, + re->name, + (unsigned long long)res_offset); + + re_ret->offset = res_offset; + rv = 0; + + out_iobuf: + free(rindex_iobuf); + out_lease: + paxos_lease_release(task, rx_token, NULL, &leader, &leader); + out_token: + free(rx_token); + free(res_token); + out_clear: + lockspace_clear_rindex_op(ri->lockspace_name); + out_close: + close_disks(rx.disk, 1); + return rv; +} + +/* + * clear the rindex entry for a given resource lease name and offset + * first the rentry is cleared, then the resource lease is cleared + */ + +int rindex_delete(struct task *task, struct sanlk_rindex *ri, + struct sanlk_rentry *re, struct sanlk_rentry *re_ret) +{ + struct rindex_info rx; + struct space_info spi; + struct leader_record leader; + struct paxos_dblock dblock; + struct token *rx_token; + struct token *res_token; + char *rindex_iobuf = NULL; + uint64_t 
res_offset = re->offset; + uint64_t ent_offset; + int entry_num; + int sector_size, align_size; + int rv; + + memset(&rx, 0, sizeof(rx)); + rx.ri = ri; + rx.disk = (struct sync_disk *)&ri->disk; + + rv = open_disk(rx.disk); + if (rv < 0) { + log_error("rindex_create open failed %d %s", rv, rx.disk->path); + return rv; + } + + /* + * Allows only one rindex op for a given lockspace at a time. + * If there's already one in progress, this returns EBUSY. + * Also collects lockspace info at the same time. + */ + memset(&spi, 0, sizeof(spi)); + + rv = lockspace_begin_rindex_op(ri->lockspace_name, RX_OP_DELETE, &spi); + if (rv < 0) { + log_error("rindex_delete lockspace not available %d %s", rv, ri->lockspace_name); + goto out_close; + } + + rv = read_rindex_header(task, &spi, &rx); + if (rv < 0) { + log_error("rindex_delete failed to read rindex header %d on %s:%llu", + rv, rx.disk->path, (unsigned long long)rx.disk->offset); + goto out_clear; + } + + sector_size = rx.header.sector_size; + align_size = sector_size_to_align_size(sector_size); + entry_num = (res_offset - rx.disk->offset - (2 * align_size)) / align_size; + ent_offset = sector_size + (entry_num * sizeof(struct rindex_entry)); + + if (re->offset && (re->offset % align_size)) { + rv = SANLK_RINDEX_OFFSET; + goto out_clear; + } + + /* used to acquire the internal paxos lease protecting the rindex */ + rx_token = setup_rindex_token(&rx, sector_size, &spi); + if (!rx_token) { + rv = -ENOMEM; + goto out_clear; + } + + /* used to write the cleared paxos lease for the resource */ + res_token = setup_resource_token(&rx, re->name, sector_size, &spi); + if (!res_token) { + free(rx_token); + rv = -ENOMEM; + goto out_clear; + } + res_token->disks[0].offset = res_offset; + + rv = paxos_lease_acquire(task, rx_token, + PAXOS_ACQUIRE_OWNER_NOWAIT | PAXOS_ACQUIRE_QUIET_FAIL, + &leader, &dblock, 0, 0); + if (rv < 0) { + /* TODO: sleep and retry if this fails because it's held by another host? 
*/ + log_error("rindex_create failed to acquire rindex lease %d", rv); + goto out_token; + } + + rv = read_rindex(task, &spi, &rx, &rindex_iobuf); + if (rv < 0) { + log_error("rindex_delete failed to read rindex %d", rv); + goto out_lease; + } + + rv = update_rindex(task, &spi, &rx, rindex_iobuf, re, ent_offset, res_offset, 1); + if (rv < 0) { + log_error("rindex_delete failed to update rindex %d", rv); + goto out_iobuf; + } + + /* clear the paxos lease */ + + rv = paxos_lease_init(task, res_token, 0, 0, 1); + if (rv < 0) { + log_error("rindex_delete failed to init new lease %d", rv); + goto out_iobuf; + } + + log_debug("rindex_delete updated rindex entry %llu for %s %llu", + (unsigned long long)ent_offset, + re->name, + (unsigned long long)res_offset); + + re_ret->offset = 0; + + rv = 0; + + out_iobuf: + free(rindex_iobuf); + out_lease: + paxos_lease_release(task, rx_token, NULL, &leader, &leader); + out_token: + free(rx_token); + free(res_token); + out_clear: + lockspace_clear_rindex_op(ri->lockspace_name); + out_close: + close_disks(rx.disk, 1); + return rv; +} + +int rindex_lookup(struct task *task, struct sanlk_rindex *ri, + struct sanlk_rentry *re, struct sanlk_rentry *re_ret, uint32_t cmd_flags) +{ + struct rindex_info rx; + struct space_info spi; + struct rindex_entry re_in; + struct rindex_entry *re_end; + char *rindex_iobuf = NULL; + uint64_t ent_offset, res_offset; + int entry_num; + int sector_size, align_size; + int nolock = cmd_flags & SANLK_RX_NO_LOCKSPACE; + int rv; + + memset(&rx, 0, sizeof(rx)); + rx.ri = ri; + rx.disk = (struct sync_disk *)&ri->disk; + + rv = open_disk(rx.disk); + if (rv < 0) { + return rv; + } + + memset(&spi, 0, sizeof(spi)); + + if (!nolock) { + rv = lockspace_begin_rindex_op(ri->lockspace_name, RX_OP_LOOKUP, &spi); + if (rv < 0) { + goto out_close; + } + } + + rv = read_rindex_header(task, &spi, &rx); + if (rv < 0) { + goto out_clear; + } + + rv = read_rindex(task, &spi, &rx, &rindex_iobuf); + if (rv < 0) { + goto out_clear; 
+ } + + sector_size = rx.header.sector_size; + align_size = sector_size_to_align_size(sector_size); + + if (re->offset && (re->offset % align_size)) { + rv = SANLK_RINDEX_OFFSET; + goto out_clear; + } + + if (!re->name[0] && !re->offset) { + /* find the first free resource lease offset */ + + rv = search_entries(&rx, rindex_iobuf, &ent_offset, &res_offset, 1, NULL); + if (rv < 0) { + goto out_iobuf; + } + + memset(re_ret->name, 0, SANLK_NAME_LEN); + re_ret->offset = res_offset; + rv = 0; + + } else if (!re->name[0] && re->offset) { + /* find the name of the resource lease that the index has recorded + for the given resource lease offset */ + + res_offset = re->offset; + entry_num = (res_offset - rx.disk->offset - (2 * align_size)) / align_size; + ent_offset = sector_size + (entry_num * sizeof(struct rindex_entry)); + + re_end = (struct rindex_entry *)(rindex_iobuf + ent_offset); + + rindex_entry_in(re_end, &re_in); + + memcpy(re_ret->name, re_in.name, SANLK_NAME_LEN); + re_ret->offset = res_offset; + rv = 0; + + } else if (re->name[0] && !re->offset) { + /* search the rindex entries for a given resource lease name and + if found return the offset of the resource lease */ + + rv = search_entries(&rx, rindex_iobuf, &ent_offset, &res_offset, 0, re->name); + if (rv < 0) { + goto out_iobuf; + } + + memcpy(re_ret->name, re->name, SANLK_NAME_LEN); + re_ret->offset = res_offset; + rv = 0; + + } else if (re->name[0] && re->offset) { + /* find the name of the resource lease that the index has recorded + for the given resource lease offset, and if it doesn't match + the specified name, then it's an error */ + + res_offset = re->offset; + entry_num = (res_offset - rx.disk->offset - (2 * align_size)) / align_size; + ent_offset = sector_size + (entry_num * sizeof(struct rindex_entry)); + + re_end = (struct rindex_entry *)(rindex_iobuf + ent_offset); + + rindex_entry_in(re_end, &re_in); + + if (strncmp(re->name, re_in.name, SANLK_NAME_LEN)) + rv = SANLK_RINDEX_DIFF; + else + rv = 
0; + + memcpy(re_ret->name, re_in.name, SANLK_NAME_LEN); + re_ret->offset = res_offset; + } + + + out_iobuf: + free(rindex_iobuf); + out_clear: + if (!nolock) + lockspace_clear_rindex_op(ri->lockspace_name); + out_close: + close_disks(rx.disk, 1); + return rv; +} + +int rindex_update(struct task *task, struct sanlk_rindex *ri, + struct sanlk_rentry *re, struct sanlk_rentry *re_ret, + uint32_t cmd_flags) +{ + struct rindex_info rx; + struct space_info spi; + char *rindex_iobuf = NULL; + uint64_t ent_offset, res_offset; + int entry_num; + int sector_size, align_size; + int op_remove = 0, op_add = 0; + int nolock = cmd_flags & SANLK_RX_NO_LOCKSPACE; + int rv; + + memset(&rx, 0, sizeof(rx)); + rx.ri = ri; + rx.disk = (struct sync_disk *)&ri->disk; + + rv = open_disk(rx.disk); + if (rv < 0) { + return rv; + } + + memset(&spi, 0, sizeof(spi)); + + if (!nolock) { + rv = lockspace_begin_rindex_op(ri->lockspace_name, RX_OP_UPDATE, &spi); + if (rv < 0) { + goto out_close; + } + } + + rv = read_rindex_header(task, &spi, &rx); + if (rv < 0) { + goto out_clear; + } + + rv = read_rindex(task, &spi, &rx, &rindex_iobuf); + if (rv < 0) { + goto out_clear; + } + + sector_size = rx.header.sector_size; + align_size = sector_size_to_align_size(sector_size); + + if (re->offset && (re->offset % align_size)) { + rv = SANLK_RINDEX_OFFSET; + goto out_clear; + } + + res_offset = re->offset; + entry_num = (res_offset - rx.disk->offset - (2 * align_size)) / align_size; + ent_offset = sector_size + (entry_num * sizeof(struct rindex_entry)); + + if ((cmd_flags & SANLK_RXUP_REM) && re->offset) { + op_remove = 1; + } else if ((cmd_flags & SANLK_RXUP_ADD) && re->name[0] && re->offset) { + op_add = 1; + } else { + rv = -EINVAL; + goto out_iobuf; + } + + rv = update_rindex(task, &spi, &rx, rindex_iobuf, re, ent_offset, res_offset, op_remove); + if (rv < 0) { + log_error("rindex_update failed to update rindex %d", rv); + goto out_iobuf; + } + rv = 0; + + if (op_remove) { + memset(re_ret->name, 0, 
SANLK_NAME_LEN); + re_ret->offset = 0; + } + if (op_add) { + memcpy(re_ret->name, re->name, SANLK_NAME_LEN); + re_ret->offset = res_offset; + } + + out_iobuf: + free(rindex_iobuf); + out_clear: + if (!nolock) + lockspace_clear_rindex_op(ri->lockspace_name); + out_close: + close_disks(rx.disk, 1); + return rv; +} + +int rindex_rebuild(struct task *task, struct sanlk_rindex *ri, uint32_t cmd_flags) +{ + struct rindex_info rx; + struct rindex_entry re_new; + struct rindex_entry re_end; + struct space_info spi; + struct leader_record leader; + struct paxos_dblock dblock; + struct token *rx_token; + struct token *res_token; + struct sanlk_resource res; + char *rindex_iobuf = NULL; + uint64_t res_offset; + uint64_t ent_offset; + uint32_t max_resources; + int sector_size, align_size; + int nolock = cmd_flags & SANLK_RX_NO_LOCKSPACE; + int i, rv; + + memset(&rx, 0, sizeof(rx)); + rx.ri = ri; + rx.disk = (struct sync_disk *)&ri->disk; + + rv = open_disk(rx.disk); + if (rv < 0) { + log_error("rindex_rebuild open failed %d %s", rv, rx.disk->path); + return rv; + } + + /* + * Allows only one rindex op for a given lockspace at a time. + * If there's already one in progress, this returns EBUSY. + * Also collects lockspace info at the same time. 
+ */ + memset(&spi, 0, sizeof(spi)); + + if (!nolock) { + rv = lockspace_begin_rindex_op(ri->lockspace_name, RX_OP_REBUILD, &spi); + if (rv < 0) { + log_error("rindex_rebuild lockspace not available %d %s", rv, ri->lockspace_name); + goto out_close; + } + } + + rv = read_rindex_header(task, &spi, &rx); + if (rv < 0) { + log_error("rindex_rebuild failed to read rindex header %d on %s:%llu", + rv, rx.disk->path, (unsigned long long)rx.disk->offset); + goto out_clear; + } + + sector_size = rx.header.sector_size; + align_size = sector_size_to_align_size(sector_size); + + /* used to acquire the internal paxos lease protecting the rindex */ + rx_token = setup_rindex_token(&rx, sector_size, &spi); + if (!rx_token) { + rv = -ENOMEM; + goto out_clear; + } + + memset(&res, 0, sizeof(res)); + + res_token = setup_resource_token(&rx, res.name, sector_size, &spi); + if (!res_token) { + free(rx_token); + rv = -ENOMEM; + goto out_clear; + } + + if (!nolock) { + rv = paxos_lease_acquire(task, rx_token, + PAXOS_ACQUIRE_OWNER_NOWAIT | PAXOS_ACQUIRE_QUIET_FAIL, + &leader, &dblock, 0, 0); + if (rv < 0) { + /* TODO: sleep and retry if this fails because it's held by another host? */ + log_error("rindex_rebuild failed to acquire rindex lease %d", rv); + goto out_token; + } + } + + rv = read_rindex(task, &spi, &rx, &rindex_iobuf); + if (rv < 0) { + log_error("rindex_rebuild failed to read rindex %d", rv); + goto out_lease; + } + + if (rx.ri->max_resources) + max_resources = rx.ri->max_resources; + else + max_resources = sector_size_to_max_resources(sector_size); + + /* + * Zero all the entries after the header sector and lease sector. + * Entries will be recreated in the zeroed space if corresponding + * resource leases are found. + */ + memset(rindex_iobuf + (2 * sector_size), 0, align_size - (2 * sector_size)); + + /* + * We read each potential resource lease offset to check if a + * lease exists there. It's ok if there is none, and we don't + * want to log errors if none is found. 
+ */ + res_token->flags |= T_CHECK_EXISTS; + + /* + * Read each potential resource lease area and add an rindex entry + * for each one that's found. Resource leases begin after + * the rindex area and the rindex lease area. + */ + res_offset = rx.disk->offset + (2 * align_size); + + for (i = 0; i < max_resources; i++) { + memset(&re_new, 0, sizeof(re_new)); + memset(&re_end, 0, sizeof(re_end)); + memset(&res, 0, sizeof(res)); + memset(res_token->r.name, 0, SANLK_NAME_LEN); + res_token->disks[0].offset = res_offset; + + rv = paxos_read_resource(task, res_token, &res); + + /* end of device */ + if (rv == -EMSGSIZE) { + log_debug("rindex_rebuild reached end of device at %llu", + (unsigned long long)res_offset); + break; + } + + if (rv == SANLK_OK) { + log_debug("rindex_rebuild found %s at %llu", + res.name, (unsigned long long)res_offset); + + re_new.res_offset = res_offset; + memcpy(re_new.name, res.name, SANLK_NAME_LEN); + rindex_entry_out(&re_new, &re_end); + + /* Within rindex, entries begin after the header sector */ + ent_offset = sector_size + (i * sizeof(struct rindex_entry)); + + memcpy(rindex_iobuf + ent_offset, &re_end, sizeof(re_end)); + } else { + log_debug("rindex_rebuild found no resource at %llu %d", + (unsigned long long)res_offset, rv); + } + + res_offset += align_size; + } + + rv = write_iobuf(rx.disk->fd, rx.disk->offset, rindex_iobuf, align_size, task, spi.io_timeout, NULL); + if (rv < 0) { + if (rv != SANLK_AIO_TIMEOUT) + free(rindex_iobuf); + log_error("rindex_rebuild write failed %d %s", rv, rx.disk->path); + goto out_lease; + } + + rv = 0; + + free(rindex_iobuf); + out_lease: + if (!nolock) + paxos_lease_release(task, rx_token, NULL, &leader, &leader); + out_token: + free(rx_token); + free(res_token); + out_clear: + if (!nolock) + lockspace_clear_rindex_op(ri->lockspace_name); + out_close: + close_disks(rx.disk, 1); + return rv; +} + diff --git a/src/rindex.h b/src/rindex.h new file mode 100644 index 0000000..5b5beed --- /dev/null +++ 
b/src/rindex.h @@ -0,0 +1,25 @@ +/* + * Copyright 2018 Red Hat, Inc. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License v2 or (at your option) any later version. + */ + +#ifndef __RINDEX_H__ +#define __RINDEX_H__ + +int rindex_format(struct task *task, struct sanlk_rindex *ri); +int rindex_rebuild(struct task *task, struct sanlk_rindex *ri, uint32_t cmd_flags); + +int rindex_lookup(struct task *task, struct sanlk_rindex *ri, + struct sanlk_rentry *re, struct sanlk_rentry *re_ret, uint32_t cmd_flags); +int rindex_update(struct task *task, struct sanlk_rindex *ri, + struct sanlk_rentry *re, struct sanlk_rentry *re_ret, uint32_t cmd_flags); + +int rindex_create(struct task *task, struct sanlk_rindex *ri, + struct sanlk_rentry *re, struct sanlk_rentry *re_ret, + uint32_t num_hosts, uint32_t max_hosts); +int rindex_delete(struct task *task, struct sanlk_rindex *ri, + struct sanlk_rentry *re, struct sanlk_rentry *re_ret); +#endif diff --git a/src/rindex_disk.h b/src/rindex_disk.h new file mode 100644 index 0000000..8889c59 --- /dev/null +++ b/src/rindex_disk.h @@ -0,0 +1,95 @@ +/* + * Copyright 2018 Red Hat, Inc. + * + * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the GNU General Public License v2 or (at your option) any later version. + */ + +#ifndef __RINDEX_DISK_H__ +#define __RINDEX_DISK_H__ + +/* + * The resource index uses two align-size (1/8M) areas. + * + * The first area (the rindex itself) holds a header and entries, + * with each entry recording a resource lease name and the + * offset of that lease (the resource leases follow the index + * areas.) + * + * The second area holds an internal paxos lease that sanlock + * uses to protect updates to the rindex.
+ * + * The rindex is one align-size area containing 2048 sectors. + * The sector 0 of the index holds the rindex_header. + * After this, sectors 1-2000 of the index hold rindex_entry's. + * The final 47 sectors are unused. + * + * 512 byte sectors hold 8 entries per sector, so 2000 sectors + * holds up to 16000 entries. + * + * 4096 byte sectors hold 64 entries per sector, so 2000 sectors + * holds up to 128000 entries. + * + * rindex_header.sector_size = 512 | 4096 + * + * area_size = 1M | 8M + * (determined from sector_size) + * + * rindex_header.max_resources defaults to 4096 to limit searching. + * The caller can specify up to 16000 | 128000 max_resources. + * + * rindex_header.rindex_offset: + * location of rindex_header from start of device, set by caller, + * must be multiple of area_size. (rindex_offset will often be + * 1*area_size because rindex typically follows the lockspace area + * which typically starts at offset 0 on the device.) + * + * entry_size = 64 bytes + * + * entry_index = N = 0 to (max_resources - 1) + * + * rindex_entry N offset = rindex_offset + sector_size + (N * entry_size) + * (the sector_size contains the rindex_header) + * + * rindex_entry N holds information about the resource lease in + * the N'th area following the two areas used by the resource index. + * + * resource_leases_start = rindex_offset + (2 * area_size) + * resource leases begin after the two resource index areas. 
+ * (rindex_offset will often be area_size, so resource_leases_start + * will often by 3*area_size) + * + * resource lease N offset = resource_leases_start + (N * area_size) + * + * rindex_entry[N].res_offset = resource lease N offset + */ + +#define RINDEX_DISK_MAGIC 0x01042018 +#define RINDEX_DISK_VERSION_MAJOR 0x00010000 +#define RINDEX_DISK_VERSION_MINOR 0x00000001 + +struct rindex_header { + uint32_t magic; + uint32_t version; + uint32_t flags; + uint32_t sector_size; + uint32_t max_resources; + uint32_t unused; + uint64_t rx_offset; /* location of rindex_header from start of disk */ + char lockspace_name[NAME_ID_SIZE]; +}; + +#define MAX_RINDEX_ENTRIES_1M 16000 +#define MAX_RINDEX_ENTRIES_8M 128000 + +/* The entry size is fixed */ + +struct rindex_entry { + uint64_t res_offset; /* location of resource from start of disk */ + uint32_t flags; + uint32_t unused; + char name[NAME_ID_SIZE]; +}; + +#endif diff --git a/src/sanlock.8 b/src/sanlock.8 index 4cf607e..a866206 100644 --- a/src/sanlock.8 +++ b/src/sanlock.8 @@ -760,8 +760,8 @@ the single sector of a delta lease, or the first sector of a paxos lease. \fR[\fP\fB:\fP\fIoffset\fP\fR[\fP\fB:\fP\fIsize\fP\fR]]\fP Read disk sectors and print leader records for delta or paxos leases. Add --f 1 to print the request record values for paxos leases, and host_ids set -in delta lease bitmaps. +-f 1 to print the request record values for paxos leases, host_ids set +in delta lease bitmaps, and rindex entries. 
.SS LOCKSPACE option string diff --git a/src/sanlock.h b/src/sanlock.h index b7feddc..d19cac2 100644 --- a/src/sanlock.h +++ b/src/sanlock.h @@ -104,6 +104,25 @@ struct sanlk_resource { struct sanlk_disk disks[0]; }; +/* make these values match the RES equivalent in case of typos */ +#define SANLK_RIF_ALIGN1M 0x00000010 /* uses 512 sectors */ +#define SANLK_RIF_ALIGN8M 0x00000020 /* uses 4k sectors */ + +struct sanlk_rindex { + uint32_t flags; /* SANLK_RIF_ */ + uint32_t max_resources; /* the max res structs that will follow rindex */ + uint64_t unused; + char lockspace_name[SANLK_NAME_LEN]; /* terminating \0 not required */ + struct sanlk_disk disk; /* location of rindex */ +}; + +struct sanlk_rentry { + char name[SANLK_NAME_LEN]; /* terminating \0 not required */ + uint64_t offset; /* disk location of the resource lease, from start of device */ + uint32_t flags; + uint32_t unused; +}; + /* command-specific command options (can include per resource data, but that requires the extra work of segmenting it by resource name) */ diff --git a/src/sanlock_admin.h b/src/sanlock_admin.h index c4711e9..5d37592 100644 --- a/src/sanlock_admin.h +++ b/src/sanlock_admin.h @@ -335,6 +335,119 @@ int sanlock_test_resource_owners(struct sanlk_resource *res, uint32_t flags, struct sanlk_host *hosts, int hosts_count, uint32_t *test_flags); +/* + * A resource index stores the disk locations (offsets) of resource leases. + * Using it is optional; an application can keep track of lease offsets + * without using the index. + * + * On disk, a resource index uses two alignment-sized regions. + * The first holds the records mapping resource names to offsets. + * The second holds a paxos lease that sanlock uses internally + * to protect updates to the index. The caller chooses the disk + * location of the resource index (path and offset), and passes + * this as a parameter to all functions that use the index with + * struct sanlk_rindex. + * + * The resource index is followed on disk by the resource leases + * that it references. 
So, using the index removes the ability of + * the application to place resource leases at any disk location. + * A caller would usually place the resource index after a lockspace + * struct on disk (not required.) + * + * The resource index and the following resource leases must all use + * the same align size/flag. + * + * The rindex specifies the lockspace name that the referenced resource + * leases are associated with. This lockspace will also be used for + * the internal rindex paxos lease. + * sanlock must be a member of the lockspace to use the create/delete + * resource functions. + * + * format + * ------ + * Initializes a resource index at the specified offset and + * initializes an internal paxos lease in the following area. + * Set the ALIGN flag in sanlk_rindex corresponding to the desired + * sector size; the align size used for the rindex must match the + * align size used for resources. + * + * lookup + * ------ + * Looks up a value in the resource index. When a res name is set + * and *offset is 0, this searches for an entry with the matching + * name and if found sets the res lease offset. When res name is not + * set and *offset is not 0, this checks for an entry with the given + * res lease offset and if found sets the res name. When name and + * offset are both unset, the first free entry is returned in offset. + * All resource lease offsets are relative to the start of the device. + * sanlock does not acquire the internal rindex paxos lease. + * (The offsets are the disk locations of the resource leases, not + * the disk locations of the rindex entries for the resource leases.) + * + * update + * ------ + * Add or remove an rindex entry. When adding, the rentry + * name and offset must both be set, and the index entry is + * set to indicate the named resource lease exists at the + * specified offset. When removing, the rentry offset needs + * to be set, and the index entry for that offset is cleared. 
+ * This is not generally used; the create/delete interfaces are + * the standard method for updating the index. + * + * create_resource + * --------------- + * Searches the index for a free resource lease area, initializes a new + * resource lease at that offset, and updates the index for + * the new lease. Returns the offset of the new resource lease. + * sanlock holds the internal rindex paxos lease around the index + * lookup, resource init and index update. The new lease is initialized + * before the index is updated, so the index will not reference + * an uninitialized area if the host fails during create_resource. + * + * delete_resource + * --------------- + * Updates the index to remove the entry for the named resource lease, + * and clears the resource lease at that offset. + * sanlock holds the internal rindex paxos lease around the + * index update and lease reinitialization. If sanlock fails + * after the index update but before clearing the resource, a + * subsequent create will overwrite the uncleared resource. + * + * rebuild + * ------- + * Rebuilds the rindex based on resource leases that are found. + * Reads each potential resource lease area to check if a + * resource lease exists at that offset. If so, an rindex + * entry is added with that resource name and offset. 
+ */ + +/* + * generic rindex flags use lower 16 bits + * specific rindex function flags use upper 16 bits + */ +#define SANLK_RX_NO_LOCKSPACE 0x000000001 /* don't use the lockspace */ + +/* update_rindex flags */ +#define SANLK_RXUP_ADD 0x00010000 +#define SANLK_RXUP_REM 0x00020000 + +int sanlock_format_rindex(struct sanlk_rindex *rx, uint32_t flags); + +int sanlock_update_rindex(struct sanlk_rindex *rx, uint32_t flags, + struct sanlk_rentry *re); + +int sanlock_lookup_rindex(struct sanlk_rindex *rx, uint32_t flags, + struct sanlk_rentry *re); + +int sanlock_rebuild_rindex(struct sanlk_rindex *rx, uint32_t flags); + +int sanlock_create_resource(struct sanlk_rindex *rx, uint32_t flags, + struct sanlk_rentry *re, + int max_hosts, int num_hosts); + +int sanlock_delete_resource(struct sanlk_rindex *rx, uint32_t flags, + struct sanlk_rentry *re); + int sanlock_version(uint32_t flags, uint32_t *version, uint32_t *proto); /* diff --git a/src/sanlock_internal.h b/src/sanlock_internal.h index 047f4db..fd9e5b7 100644 --- a/src/sanlock_internal.h +++ b/src/sanlock_internal.h @@ -28,6 +28,7 @@ #include "leader.h" #include "paxos_dblock.h" #include "mode_block.h" +#include "rindex_disk.h" #include "list.h" #include "monotime.h" @@ -84,6 +85,7 @@ struct delta_extra { #define T_RESTRICT_SIGTERM 0x00000002 /* inherited from client->restricted */ #define T_RETRACT_PAXOS 0x00000004 #define T_WRITE_DBLOCK_MBLOCK_SH 0x00000008 /* make paxos layer include mb SHARED with dblock */ +#define T_CHECK_EXISTS 0x00000010 /* make paxos layer not error if reading lease finds none */ struct token { /* values copied from acquire res arg */ @@ -113,6 +115,12 @@ struct token { struct sync_disk *disks; /* shorthand, points to r.disks[0] */ struct sanlk_resource r; + /* + * sanlk_resource must be the last element of token. + * sanlk_resource ends with sanlk_disk disks[0], + * and allocating a token allocates N sanlk_disk structs + * after the token struct so they follow the sanlk_resource. 
+ */ }; #define R_SHARED 0x00000001 @@ -198,6 +206,7 @@ struct space { uint32_t flags; /* SP_ */ uint32_t used_retries; uint32_t renewal_read_extend_sec; /* defaults to io_timeout */ + uint32_t rindex_op; int sector_size; int align_size; int renew_fail; @@ -232,6 +241,13 @@ struct space_info { int killing_pids; }; +#define RX_OP_FORMAT 1 +#define RX_OP_CREATE 2 +#define RX_OP_DELETE 3 +#define RX_OP_LOOKUP 4 +#define RX_OP_UPDATE 5 +#define RX_OP_REBUILD 6 + #define HOSTID_AIO_CB_SIZE 4 #define WORKER_AIO_CB_SIZE 2 #define DIRECT_AIO_CB_SIZE 1 @@ -359,6 +375,9 @@ struct command_line { char our_host_name[SANLK_NAME_LEN+1]; char *file_path; char *dump_path; + int rindex_op; + struct sanlk_rentry rentry; /* -e */ + struct sanlk_rindex rindex; /* -x RINDEX */ struct sanlk_lockspace lockspace; /* -s LOCKSPACE */ struct sanlk_resource *res_args[SANLK_MAX_RESOURCES]; /* -r RESOURCE */ }; @@ -402,6 +421,12 @@ enum { ACT_SET_CONFIG, ACT_WRITE_LEADER, ACT_RENEWAL, + ACT_FORMAT, + ACT_CREATE, + ACT_DELETE, + ACT_LOOKUP, + ACT_UPDATE, + ACT_REBUILD, }; EXTERN int external_shutdown; diff --git a/src/sanlock_rv.h b/src/sanlock_rv.h index bafef2b..1fcd5a4 100644 --- a/src/sanlock_rv.h +++ b/src/sanlock_rv.h @@ -65,4 +65,11 @@ #define SANLK_REQUEST_OLD -272 #define SANLK_REQUEST_LVER -273 +/* rindex ops */ +#define SANLK_RINDEX_MAGIC -274 +#define SANLK_RINDEX_VERSION -275 +#define SANLK_RINDEX_LOCKSPACE -276 +#define SANLK_RINDEX_OFFSET -277 +#define SANLK_RINDEX_DIFF -278 + #endif diff --git a/src/sanlock_sock.h b/src/sanlock_sock.h index 9b8aad7..0121b9c 100644 --- a/src/sanlock_sock.h +++ b/src/sanlock_sock.h @@ -54,6 +54,12 @@ enum { SM_CMD_SET_EVENT = 32, SM_CMD_SET_CONFIG = 33, SM_CMD_RENEWAL = 34, + SM_CMD_FORMAT_RINDEX = 35, + SM_CMD_UPDATE_RINDEX = 36, + SM_CMD_LOOKUP_RINDEX = 37, + SM_CMD_CREATE_RESOURCE = 38, + SM_CMD_DELETE_RESOURCE = 39, + SM_CMD_REBUILD_RINDEX = 40, }; #define SM_CB_GET_EVENT 1