From 15dba96375ce98ecd7ec00e0812eb469d5783aec Mon Sep 17 00:00:00 2001 From: Vitaliy Filippov Date: Fri, 9 Oct 2020 21:56:47 +0000 Subject: [PATCH] Implement inode removal tool. Removes multiple objects from multiple OSDs in parallel --- Makefile | 7 +- README.md | 19 +-- cluster_client.cpp | 20 ++- cluster_client.h | 9 +- nbd_proxy.cpp | 2 +- rm_inode.cpp | 325 +++++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 369 insertions(+), 13 deletions(-) create mode 100644 rm_inode.cpp diff --git a/Makefile b/Makefile index 207e9f14..aa2ca0b9 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ BLOCKSTORE_OBJS := allocator.o blockstore.o blockstore_impl.o blockstore_init.o blockstore_write.o blockstore_sync.o blockstore_stable.o blockstore_rollback.o blockstore_flush.o crc32c.o ringloop.o # -fsanitize=address CXXFLAGS := -g -O3 -Wall -Wno-sign-compare -Wno-comment -Wno-parentheses -Wno-pointer-arith -fPIC -fdiagnostics-color=always -all: libfio_blockstore.so osd libfio_sec_osd.so libfio_cluster.so stub_osd stub_uring_osd stub_bench osd_test dump_journal qemu_driver.so nbd_proxy +all: libfio_blockstore.so osd libfio_sec_osd.so libfio_cluster.so stub_osd stub_uring_osd stub_bench osd_test dump_journal qemu_driver.so nbd_proxy rm_inode clean: rm -f *.o @@ -44,6 +44,9 @@ libfio_cluster.so: fio_cluster.o $(FIO_CLUSTER_OBJS) nbd_proxy: nbd_proxy.o $(FIO_CLUSTER_OBJS) g++ $(CXXFLAGS) -ltcmalloc_minimal -o $@ $< $(FIO_CLUSTER_OBJS) -luring +rm_inode: rm_inode.o $(FIO_CLUSTER_OBJS) + g++ $(CXXFLAGS) -ltcmalloc_minimal -o $@ $< $(FIO_CLUSTER_OBJS) -luring + qemu_driver.o: qemu_driver.c qemu_proxy.h gcc -I qemu/b/qemu `pkg-config glib-2.0 --cflags` \ -I qemu/include $(CXXFLAGS) -c -o $@ $< @@ -149,6 +152,8 @@ qemu_proxy.o: qemu_proxy.cpp cluster_client.h etcd_state_client.h http_client.h g++ $(CXXFLAGS) -c -o $@ $< ringloop.o: ringloop.cpp ringloop.h g++ $(CXXFLAGS) -c -o $@ $< +rm_inode.o: rm_inode.cpp cluster_client.h etcd_state_client.h http_client.h json11/json11.hpp malloc_or_die.h messenger.h object_id.h osd_id.h osd_ops.h ringloop.h timerfd_manager.h + g++ $(CXXFLAGS) -c -o $@ $< rw_blocking.o: rw_blocking.cpp rw_blocking.h g++ $(CXXFLAGS) -c -o $@ $< stub_bench.o: stub_bench.cpp object_id.h osd_id.h osd_ops.h rw_blocking.h diff --git a/README.md b/README.md index c13718f1..8d1176b8 100644 --- a/README.md +++ b/README.md @@ -31,12 +31,12 @@ breaking changes in the future. However, the following is implemented: - QEMU driver (built out-of-tree) - Loadable fio engine for benchmarks (also built out-of-tree) - NBD proxy for kernel mounts +- Inode removal tool (./rm_inode) ## Roadmap - Packaging for Debian and, probably, CentOS too - OSD creation tool (OSDs currently have to be created by hand) -- Inode deletion tool (currently you can't delete anything :)) - Other administrative tools - Per-inode I/O and space usage statistics - jerasure EC support with any number of data and parity drives in a group @@ -333,19 +333,22 @@ and calculate disk offsets almost by hand. This will be fixed in near future. -device virtio-blk-pci,scsi=off,bus=pci.0,addr=0x5,drive=drive-virtio-disk0,id=virtio-disk0,bootindex=1,write-cache=off,physical_block_size=4096,logical_block_size=512 -vnc 0.0.0.0:0 ``` +- Remove inode with (for example): + ``` + ./rm_inode --etcd_address 10.115.0.10:2379/v3 --pool 1 --inode 1 --parallel_osds 16 --iodepth 32 + ``` ## Known Problems - OSDs may currently crash with "can't get SQE, will fall out of sync with EPOLLET" if you try to load them with very long iodepths because io_uring queue (ring) is limited and OSDs don't check if it fills up. -- Object deletion requests may currently lead to unfound objects on crashes because - proper handling of deletions in a cluster requires a "three-phase cleanup process" - and it's currently not implemented. In fact, even though deletion requests are - implemented, there's no user tool to delete anything from the cluster yet :). - Of course I'll create such tool, but its first implementation will be vulnerable to this issue. - It's not a big deal though, because you'll be able to just repeat the deletion request - in this case. +- Object deletion requests may currently lead to 'incomplete' objects if your OSDs crash during + deletion because proper handling of object cleanup in a cluster should be "three-phase" + and it's currently not implemented. Inode removal tool currently can't handle unclean + objects, so incomplete objects become undeletable. This will be fixed in near future + by allowing the inode removal tool to delete unclean objects. With this problem fixed + you'll be able just to repeat the removal again. ## Implementation Principles diff --git a/cluster_client.cpp b/cluster_client.cpp index c3f46602..906522ed 100644 --- a/cluster_client.cpp +++ b/cluster_client.cpp @@ -203,6 +203,12 @@ void cluster_client_t::on_load_pgs_hook(bool success) { pg_counts[pool_item.first] = pool_item.second.real_pg_count; } + pgs_loaded = true; + for (auto fn: on_ready_hooks) + { + fn(); + } + on_ready_hooks.clear(); for (auto op: offline_ops) { execute(op); @@ -254,6 +260,18 @@ void cluster_client_t::on_change_osd_state_hook(uint64_t peer_osd) } } +void cluster_client_t::on_ready(std::function fn) +{ + if (pgs_loaded) + { + fn(); + } + else + { + on_ready_hooks.push_back(fn); + } +} + /** * How writes are synced when immediate_commit is false * @@ -284,7 +302,7 @@ void cluster_client_t::on_change_osd_state_hook(uint64_t peer_osd) void cluster_client_t::execute(cluster_op_t *op) { - if (!bs_disk_alignment) + if (!pgs_loaded) { // We're offline offline_ops.push_back(op); diff --git a/cluster_client.h b/cluster_client.h index dbdc769c..c84941ab 100644 --- a/cluster_client.h +++ b/cluster_client.h @@ -63,8 +63,6 @@ class cluster_client_t int up_wait_retry_interval = 500; // ms uint64_t op_id = 1; - etcd_state_client_t st_cli; - osd_messenger_t msgr; ring_consumer_t consumer; // operations currently in progress std::set cur_ops; @@ -78,10 +76,17 @@ class cluster_client_t std::vector offline_ops; uint64_t queued_bytes = 0; + bool pgs_loaded = false; + std::vector> on_ready_hooks; + public: + etcd_state_client_t st_cli; + osd_messenger_t msgr; + cluster_client_t(ring_loop_t *ringloop, timerfd_manager_t *tfd, json11::Json & config); ~cluster_client_t(); void execute(cluster_op_t *op); + void on_ready(std::function fn); void stop(); protected: diff --git a/nbd_proxy.cpp b/nbd_proxy.cpp index 9eca984d..879cf561 100644 --- a/nbd_proxy.cpp +++ b/nbd_proxy.cpp @@ -107,7 +107,7 @@ public: { printf( "Vitastor NBD proxy\n" - "(c) Vitaliy Filippov, 2020 (VNPL-1.0 or GNU GPL 2.0+)\n\n" + "(c) Vitaliy Filippov, 2020 (VNPL-1.0)\n\n" "USAGE:\n" " %s map --etcd_address --pool --inode --size \n" " %s unmap /dev/nbd0\n" diff --git a/rm_inode.cpp b/rm_inode.cpp new file mode 100644 index 00000000..de17fc8b --- /dev/null +++ b/rm_inode.cpp @@ -0,0 +1,325 @@ +// Copyright (c) Vitaliy Filippov, 2019+ +// License: VNPL-1.0 (see README.md for details) + +/** + * Inode removal tool + * May be included into a bigger "command-line management interface" in the future + */ + +#include + +#include "epoll_manager.h" +#include "cluster_client.h" +#include "pg_states.h" + +#define RM_NO_LIST 1 +#define RM_LIST_SENT 2 +#define RM_REMOVING 3 +#define RM_END 4 + +const char *exe_name = NULL; + +struct rm_pg_osd_t +{ + pg_num_t pg_num; + osd_num_t osd_num; + int state = 0; + obj_ver_id *obj_list = NULL; + uint64_t obj_count = 0, obj_pos = 0, obj_done = 0, obj_prev_done = 0; + int in_flight = 0; +}; + +class rm_inode_t +{ +protected: + uint64_t inode = 0, pool_id = 0; + uint64_t iodepth = 0, parallel_osds = 0; + + ring_loop_t *ringloop = NULL; + epoll_manager_t *epmgr = NULL; + cluster_client_t *cli = NULL; + ring_consumer_t consumer; + + std::vector lists; + uint64_t total_count = 0, total_done = 0, total_prev_pct = 0; + uint64_t pgs_to_list = 0; + bool started = false; + bool progress = true; + int log_level = 0; + +public: + static json11::Json::object parse_args(int narg, const char *args[]) + { + json11::Json::object cfg; + cfg["progress"] = "1"; + for (int i = 1; i < narg; i++) + { + if (!strcmp(args[i], "-h") || !strcmp(args[i], "--help")) + { + help(); + } + else if (args[i][0] == '-' && args[i][1] == '-') + { + const char *opt = args[i]+2; + cfg[opt] = !strcmp(opt, "json") || i == narg-1 ? "1" : args[++i]; + } + } + return cfg; + } + + static void help() + { + printf( + "Vitastor inode removal tool\n" + "(c) Vitaliy Filippov, 2020 (VNPL-1.0)\n\n" + "USAGE:\n" + " %s --etcd_address --pool --inode \n", + exe_name + ); + exit(0); + } + + void run(json11::Json cfg) + { + if (cfg["etcd_address"].string_value() == "") + { + fprintf(stderr, "etcd_address is missing\n"); + exit(1); + } + inode = cfg["inode"].uint64_value(); + pool_id = cfg["pool"].uint64_value(); + if (pool_id) + inode = (inode & ((1l << (64-POOL_ID_BITS)) - 1)) | (pool_id << (64-POOL_ID_BITS)); + pool_id = INODE_POOL(inode); + if (!pool_id) + { + fprintf(stderr, "pool is missing"); + exit(1); + } + iodepth = cfg["iodepth"].uint64_value(); + if (!iodepth) + iodepth = 32; + parallel_osds = cfg["parallel_osds"].uint64_value(); + if (!parallel_osds) + parallel_osds = 4; + log_level = cfg["log_level"].uint64_value(); + progress = cfg["progress"].uint64_value() ? true : false; + // Create client + ringloop = new ring_loop_t(512); + epmgr = new epoll_manager_t(ringloop); + cli = new cluster_client_t(ringloop, epmgr->tfd, cfg); + cli->on_ready([this]() { start_delete(); }); + // Initialize job + consumer.loop = [this]() + { + if (started) + continue_delete(); + ringloop->submit(); + }; + ringloop->register_consumer(&consumer); + // Loop until it completes + while (1) + { + ringloop->loop(); + ringloop->wait(); + } + } + + void start_delete() + { + if (cli->st_cli.pool_config.find(pool_id) == cli->st_cli.pool_config.end()) + { + fprintf(stderr, "Pool %lu does not exist\n", pool_id); + exit(1); + } + auto pool_cfg = cli->st_cli.pool_config[pool_id]; + for (auto & pg_item: pool_cfg.pg_config) + { + auto & pg = pg_item.second; + if (pg.pause || !pg.cur_primary || pg.cur_state != PG_ACTIVE) + { + // FIXME Support deletion in non-clean active PGs by introducing a "primary-list" command + fprintf(stderr, "PG %u is not active+clean, skipping\n", pg_item.first); + continue; + } + rm_pg_osd_t *r = new rm_pg_osd_t(); + r->pg_num = pg_item.first; + r->osd_num = pg.cur_primary; + r->state = RM_NO_LIST; + lists.push_back(r); + } + std::sort(lists.begin(), lists.end(), [](rm_pg_osd_t *a, rm_pg_osd_t *b) + { + return a->osd_num < b->osd_num ? true : false; + }); + pgs_to_list = lists.size(); + started = true; + continue_delete(); + } + + void send_list(rm_pg_osd_t *cur_list) + { + if (cli->msgr.osd_peer_fds.find(cur_list->osd_num) == + cli->msgr.osd_peer_fds.end()) + { + // Initiate connection + cli->msgr.connect_peer(cur_list->osd_num, cli->st_cli.peer_states[cur_list->osd_num]); + return; + } + osd_op_t *op = new osd_op_t(); + op->op_type = OSD_OP_OUT; + op->peer_fd = cli->msgr.osd_peer_fds[cur_list->osd_num]; + op->req = { + .sec_list = { + .header = { + .magic = SECONDARY_OSD_OP_MAGIC, + .id = cli->msgr.next_subop_id++, + .opcode = OSD_OP_SEC_LIST, + }, + .list_pg = cur_list->pg_num, + .pg_count = (pg_num_t)cli->st_cli.pool_config[pool_id].real_pg_count, + .pg_stripe_size = cli->st_cli.pool_config[pool_id].pg_stripe_size, + .min_inode = inode, + .max_inode = inode, + }, + }; + op->callback = [this, cur_list](osd_op_t *op) + { + pgs_to_list--; + if (op->reply.hdr.retval < 0) + { + fprintf(stderr, "Failed to get object list from OSD %lu (retval=%ld), skipping the PG\n", + cur_list->osd_num, op->reply.hdr.retval); + cli->msgr.stop_client(cur_list->osd_num); + delete op; + cur_list->state = RM_END; + continue_delete(); + return; + } + if (log_level > 0) + { + printf( + "[PG %u] Got inode object list from OSD %lu: %ld object versions\n", + cur_list->pg_num, cur_list->osd_num, op->reply.hdr.retval + ); + } + cur_list->obj_list = (obj_ver_id*)op->buf; + cur_list->obj_count = (uint64_t)op->reply.hdr.retval; + cur_list->obj_done = cur_list->obj_prev_done = cur_list->obj_pos = 0; + total_count += cur_list->obj_count; + total_prev_pct = 0; + // set op->buf to NULL so it doesn't get freed + op->buf = NULL; + delete op; + cur_list->state = RM_REMOVING; + continue_delete(); + }; + cur_list->state = RM_LIST_SENT; + cli->msgr.outbox_push(op); + } + + void send_ops(rm_pg_osd_t *cur_list) + { + if (cli->msgr.osd_peer_fds.find(cur_list->osd_num) == + cli->msgr.osd_peer_fds.end()) + { + // Initiate connection + cli->msgr.connect_peer(cur_list->osd_num, cli->st_cli.peer_states[cur_list->osd_num]); + return; + } + while (cur_list->in_flight < iodepth && cur_list->obj_pos < cur_list->obj_count) + { + osd_op_t *op = new osd_op_t(); + op->op_type = OSD_OP_OUT; + op->peer_fd = cli->msgr.osd_peer_fds[cur_list->osd_num]; + op->req = { + .rw = { + .header = { + .magic = SECONDARY_OSD_OP_MAGIC, + .id = cli->msgr.next_subop_id++, + .opcode = OSD_OP_DELETE, + }, + .inode = cur_list->obj_list[cur_list->obj_pos].oid.inode, + .offset = (cur_list->obj_list[cur_list->obj_pos].oid.stripe & ~STRIPE_MASK), + .len = 0, + }, + }; + op->callback = [this, cur_list](osd_op_t *op) + { + cur_list->in_flight--; + if (op->reply.hdr.retval < 0) + { + fprintf(stderr, "Failed to remove object from PG %u (OSD %lu) (retval=%ld)\n", + cur_list->pg_num, cur_list->osd_num, op->reply.hdr.retval); + } + delete op; + cur_list->obj_done++; + total_done++; + continue_delete(); + }; + cli->msgr.outbox_push(op); + cur_list->obj_pos++; + cur_list->in_flight++; + } + if (!cur_list->in_flight && cur_list->obj_pos >= cur_list->obj_count) + { + free(cur_list->obj_list); + cur_list->obj_list = NULL; + cur_list->obj_count = 0; + cur_list->obj_done = cur_list->obj_prev_done = cur_list->obj_pos = 0; + cur_list->state = RM_END; + } + } + + void continue_delete() + { + int par_osd = 0; + osd_num_t max_seen_osd = 0; + for (int i = 0; i < lists.size(); i++) + { + if (lists[i]->state == RM_END) + { + delete lists[i]; + lists.erase(lists.begin()+i, lists.begin()+i+1); + i--; + } + else if (lists[i]->osd_num > max_seen_osd) + { + if (lists[i]->state == RM_NO_LIST) + { + send_list(lists[i]); + } + else if (lists[i]->state == RM_REMOVING) + { + send_ops(lists[i]); + } + par_osd++; + max_seen_osd = lists[i]->osd_num; + if (par_osd >= parallel_osds) + { + break; + } + } + } + if (progress && total_count > 0 && total_done*1000/total_count != total_prev_pct) + { + printf("\rRemoved %lu/%lu objects, %lu more PGs to list...", total_done, total_count, pgs_to_list); + total_prev_pct = total_done*1000/total_count; + } + if (!lists.size()) + { + printf("Done, inode %lu in pool %lu removed\n", (inode & ((1l << (64-POOL_ID_BITS)) - 1)), pool_id); + exit(0); + } + } +}; + +int main(int narg, const char *args[]) +{ + setvbuf(stdout, NULL, _IONBF, 0); + setvbuf(stderr, NULL, _IONBF, 0); + exe_name = args[0]; + rm_inode_t *p = new rm_inode_t(); + p->run(rm_inode_t::parse_args(narg, args)); + return 0; +}