Implement inode removal tool. Removes multiple objects from multiple OSDs in parallel

Vitaliy Filippov 2020-10-09 21:56:47 +00:00
padre 3d05aa9362
commit 15dba96375
Se han modificado 6 ficheros con 369 adiciones y 13 borrados

Ver fichero

@ -2,7 +2,7 @@ BLOCKSTORE_OBJS := allocator.o blockstore.o blockstore_impl.o blockstore_init.o
blockstore_write.o blockstore_sync.o blockstore_stable.o blockstore_rollback.o blockstore_flush.o crc32c.o ringloop.o
# -fsanitize=address
CXXFLAGS := -g -O3 -Wall -Wno-sign-compare -Wno-comment -Wno-parentheses -Wno-pointer-arith -fPIC -fdiagnostics-color=always
all: libfio_blockstore.so osd libfio_sec_osd.so libfio_cluster.so stub_osd stub_uring_osd stub_bench osd_test dump_journal qemu_driver.so nbd_proxy
all: libfio_blockstore.so osd libfio_sec_osd.so libfio_cluster.so stub_osd stub_uring_osd stub_bench osd_test dump_journal qemu_driver.so nbd_proxy rm_inode
clean:
rm -f *.o
@ -44,6 +44,9 @@ libfio_cluster.so: fio_cluster.o $(FIO_CLUSTER_OBJS)
nbd_proxy: nbd_proxy.o $(FIO_CLUSTER_OBJS)
g++ $(CXXFLAGS) -ltcmalloc_minimal -o $@ $< $(FIO_CLUSTER_OBJS) -luring
rm_inode: rm_inode.o $(FIO_CLUSTER_OBJS)
g++ $(CXXFLAGS) -ltcmalloc_minimal -o $@ $< $(FIO_CLUSTER_OBJS) -luring
qemu_driver.o: qemu_driver.c qemu_proxy.h
gcc -I qemu/b/qemu `pkg-config glib-2.0 --cflags` \
-I qemu/include $(CXXFLAGS) -c -o $@ $<
@ -149,6 +152,8 @@ qemu_proxy.o: qemu_proxy.cpp cluster_client.h etcd_state_client.h http_client.h
g++ $(CXXFLAGS) -c -o $@ $<
ringloop.o: ringloop.cpp ringloop.h
g++ $(CXXFLAGS) -c -o $@ $<
rm_inode.o: rm_inode.cpp cluster_client.h etcd_state_client.h http_client.h json11/json11.hpp malloc_or_die.h messenger.h object_id.h osd_id.h osd_ops.h ringloop.h timerfd_manager.h
g++ $(CXXFLAGS) -c -o $@ $<
rw_blocking.o: rw_blocking.cpp rw_blocking.h
g++ $(CXXFLAGS) -c -o $@ $<
stub_bench.o: stub_bench.cpp object_id.h osd_id.h osd_ops.h rw_blocking.h

Ver fichero

@ -31,12 +31,12 @@ breaking changes in the future. However, the following is implemented:
- QEMU driver (built out-of-tree)
- Loadable fio engine for benchmarks (also built out-of-tree)
- NBD proxy for kernel mounts
- Inode removal tool (./rm_inode)
## Roadmap
- Packaging for Debian and, probably, CentOS too
- OSD creation tool (OSDs currently have to be created by hand)
- Inode deletion tool (currently you can't delete anything :))
- Other administrative tools
- Per-inode I/O and space usage statistics
- jerasure EC support with any number of data and parity drives in a group
@ -333,19 +333,22 @@ and calculate disk offsets almost by hand. This will be fixed in near future.
-device virtio-blk-pci,scsi=off,bus=pci.0,addr=0x5,drive=drive-virtio-disk0,id=virtio-disk0,bootindex=1,write-cache=off,physical_block_size=4096,logical_block_size=512
-vnc 0.0.0.0:0
```
- Remove inode with (for example):
```
./rm_inode --etcd_address 10.115.0.10:2379/v3 --pool 1 --inode 1 --parallel_osds 16 --iodepth 32
```
## Known Problems
- OSDs may currently crash with "can't get SQE, will fall out of sync with EPOLLET"
if you try to load them with very long iodepths because io_uring queue (ring) is limited
and OSDs don't check if it fills up.
- Object deletion requests may currently lead to unfound objects on crashes because
proper handling of deletions in a cluster requires a "three-phase cleanup process"
and it's currently not implemented. In fact, even though deletion requests are
implemented, there's no user tool to delete anything from the cluster yet :).
Of course I'll create such tool, but its first implementation will be vulnerable to this issue.
It's not a big deal though, because you'll be able to just repeat the deletion request
in this case.
- Object deletion requests may currently lead to 'incomplete' objects if your OSDs crash during
deletion because proper handling of object cleanup in a cluster should be "three-phase"
and it's currently not implemented. Inode removal tool currently can't handle unclean
objects, so incomplete objects become undeletable. This will be fixed in near future
by allowing the inode removal tool to delete unclean objects. With this problem fixed
you'll be able just to repeat the removal again.
## Implementation Principles

Ver fichero

@ -203,6 +203,12 @@ void cluster_client_t::on_load_pgs_hook(bool success)
{
pg_counts[pool_item.first] = pool_item.second.real_pg_count;
}
pgs_loaded = true;
for (auto fn: on_ready_hooks)
{
fn();
}
on_ready_hooks.clear();
for (auto op: offline_ops)
{
execute(op);
@ -254,6 +260,18 @@ void cluster_client_t::on_change_osd_state_hook(uint64_t peer_osd)
}
}
void cluster_client_t::on_ready(std::function<void(void)> fn)
{
if (pgs_loaded)
{
fn();
}
else
{
on_ready_hooks.push_back(fn);
}
}
/**
* How writes are synced when immediate_commit is false
*
@ -284,7 +302,7 @@ void cluster_client_t::on_change_osd_state_hook(uint64_t peer_osd)
void cluster_client_t::execute(cluster_op_t *op)
{
if (!bs_disk_alignment)
if (!pgs_loaded)
{
// We're offline
offline_ops.push_back(op);

Ver fichero

@ -63,8 +63,6 @@ class cluster_client_t
int up_wait_retry_interval = 500; // ms
uint64_t op_id = 1;
etcd_state_client_t st_cli;
osd_messenger_t msgr;
ring_consumer_t consumer;
// operations currently in progress
std::set<cluster_op_t*> cur_ops;
@ -78,10 +76,17 @@ class cluster_client_t
std::vector<cluster_op_t*> offline_ops;
uint64_t queued_bytes = 0;
bool pgs_loaded = false;
std::vector<std::function<void(void)>> on_ready_hooks;
public:
etcd_state_client_t st_cli;
osd_messenger_t msgr;
cluster_client_t(ring_loop_t *ringloop, timerfd_manager_t *tfd, json11::Json & config);
~cluster_client_t();
void execute(cluster_op_t *op);
void on_ready(std::function<void(void)> fn);
void stop();
protected:

Ver fichero

@ -107,7 +107,7 @@ public:
{
printf(
"Vitastor NBD proxy\n"
"(c) Vitaliy Filippov, 2020 (VNPL-1.0 or GNU GPL 2.0+)\n\n"
"(c) Vitaliy Filippov, 2020 (VNPL-1.0)\n\n"
"USAGE:\n"
" %s map --etcd_address <etcd_address> --pool <pool> --inode <inode> --size <size in bytes>\n"
" %s unmap /dev/nbd0\n"

325
rm_inode.cpp Normal file
Ver fichero

@ -0,0 +1,325 @@
// Copyright (c) Vitaliy Filippov, 2019+
// License: VNPL-1.0 (see README.md for details)
/**
* Inode removal tool
* May be included into a bigger "command-line management interface" in the future
*/
#include <algorithm>
#include "epoll_manager.h"
#include "cluster_client.h"
#include "pg_states.h"
#define RM_NO_LIST 1
#define RM_LIST_SENT 2
#define RM_REMOVING 3
#define RM_END 4
const char *exe_name = NULL;
struct rm_pg_osd_t
{
pg_num_t pg_num;
osd_num_t osd_num;
int state = 0;
obj_ver_id *obj_list = NULL;
uint64_t obj_count = 0, obj_pos = 0, obj_done = 0, obj_prev_done = 0;
int in_flight = 0;
};
class rm_inode_t
{
protected:
uint64_t inode = 0, pool_id = 0;
uint64_t iodepth = 0, parallel_osds = 0;
ring_loop_t *ringloop = NULL;
epoll_manager_t *epmgr = NULL;
cluster_client_t *cli = NULL;
ring_consumer_t consumer;
std::vector<rm_pg_osd_t*> lists;
uint64_t total_count = 0, total_done = 0, total_prev_pct = 0;
uint64_t pgs_to_list = 0;
bool started = false;
bool progress = true;
int log_level = 0;
public:
static json11::Json::object parse_args(int narg, const char *args[])
{
json11::Json::object cfg;
cfg["progress"] = "1";
for (int i = 1; i < narg; i++)
{
if (!strcmp(args[i], "-h") || !strcmp(args[i], "--help"))
{
help();
}
else if (args[i][0] == '-' && args[i][1] == '-')
{
const char *opt = args[i]+2;
cfg[opt] = !strcmp(opt, "json") || i == narg-1 ? "1" : args[++i];
}
}
return cfg;
}
static void help()
{
printf(
"Vitastor inode removal tool\n"
"(c) Vitaliy Filippov, 2020 (VNPL-1.0)\n\n"
"USAGE:\n"
" %s --etcd_address <etcd_address> --pool <pool> --inode <inode>\n",
exe_name
);
exit(0);
}
void run(json11::Json cfg)
{
if (cfg["etcd_address"].string_value() == "")
{
fprintf(stderr, "etcd_address is missing\n");
exit(1);
}
inode = cfg["inode"].uint64_value();
pool_id = cfg["pool"].uint64_value();
if (pool_id)
inode = (inode & ((1l << (64-POOL_ID_BITS)) - 1)) | (pool_id << (64-POOL_ID_BITS));
pool_id = INODE_POOL(inode);
if (!pool_id)
{
fprintf(stderr, "pool is missing");
exit(1);
}
iodepth = cfg["iodepth"].uint64_value();
if (!iodepth)
iodepth = 32;
parallel_osds = cfg["parallel_osds"].uint64_value();
if (!parallel_osds)
parallel_osds = 4;
log_level = cfg["log_level"].uint64_value();
progress = cfg["progress"].uint64_value() ? true : false;
// Create client
ringloop = new ring_loop_t(512);
epmgr = new epoll_manager_t(ringloop);
cli = new cluster_client_t(ringloop, epmgr->tfd, cfg);
cli->on_ready([this]() { start_delete(); });
// Initialize job
consumer.loop = [this]()
{
if (started)
continue_delete();
ringloop->submit();
};
ringloop->register_consumer(&consumer);
// Loop until it completes
while (1)
{
ringloop->loop();
ringloop->wait();
}
}
void start_delete()
{
if (cli->st_cli.pool_config.find(pool_id) == cli->st_cli.pool_config.end())
{
fprintf(stderr, "Pool %lu does not exist\n", pool_id);
exit(1);
}
auto pool_cfg = cli->st_cli.pool_config[pool_id];
for (auto & pg_item: pool_cfg.pg_config)
{
auto & pg = pg_item.second;
if (pg.pause || !pg.cur_primary || pg.cur_state != PG_ACTIVE)
{
// FIXME Support deletion in non-clean active PGs by introducing a "primary-list" command
fprintf(stderr, "PG %u is not active+clean, skipping\n", pg_item.first);
continue;
}
rm_pg_osd_t *r = new rm_pg_osd_t();
r->pg_num = pg_item.first;
r->osd_num = pg.cur_primary;
r->state = RM_NO_LIST;
lists.push_back(r);
}
std::sort(lists.begin(), lists.end(), [](rm_pg_osd_t *a, rm_pg_osd_t *b)
{
return a->osd_num < b->osd_num ? true : false;
});
pgs_to_list = lists.size();
started = true;
continue_delete();
}
void send_list(rm_pg_osd_t *cur_list)
{
if (cli->msgr.osd_peer_fds.find(cur_list->osd_num) ==
cli->msgr.osd_peer_fds.end())
{
// Initiate connection
cli->msgr.connect_peer(cur_list->osd_num, cli->st_cli.peer_states[cur_list->osd_num]);
return;
}
osd_op_t *op = new osd_op_t();
op->op_type = OSD_OP_OUT;
op->peer_fd = cli->msgr.osd_peer_fds[cur_list->osd_num];
op->req = {
.sec_list = {
.header = {
.magic = SECONDARY_OSD_OP_MAGIC,
.id = cli->msgr.next_subop_id++,
.opcode = OSD_OP_SEC_LIST,
},
.list_pg = cur_list->pg_num,
.pg_count = (pg_num_t)cli->st_cli.pool_config[pool_id].real_pg_count,
.pg_stripe_size = cli->st_cli.pool_config[pool_id].pg_stripe_size,
.min_inode = inode,
.max_inode = inode,
},
};
op->callback = [this, cur_list](osd_op_t *op)
{
pgs_to_list--;
if (op->reply.hdr.retval < 0)
{
fprintf(stderr, "Failed to get object list from OSD %lu (retval=%ld), skipping the PG\n",
cur_list->osd_num, op->reply.hdr.retval);
cli->msgr.stop_client(cur_list->osd_num);
delete op;
cur_list->state = RM_END;
continue_delete();
return;
}
if (log_level > 0)
{
printf(
"[PG %u] Got inode object list from OSD %lu: %ld object versions\n",
cur_list->pg_num, cur_list->osd_num, op->reply.hdr.retval
);
}
cur_list->obj_list = (obj_ver_id*)op->buf;
cur_list->obj_count = (uint64_t)op->reply.hdr.retval;
cur_list->obj_done = cur_list->obj_prev_done = cur_list->obj_pos = 0;
total_count += cur_list->obj_count;
total_prev_pct = 0;
// set op->buf to NULL so it doesn't get freed
op->buf = NULL;
delete op;
cur_list->state = RM_REMOVING;
continue_delete();
};
cur_list->state = RM_LIST_SENT;
cli->msgr.outbox_push(op);
}
void send_ops(rm_pg_osd_t *cur_list)
{
if (cli->msgr.osd_peer_fds.find(cur_list->osd_num) ==
cli->msgr.osd_peer_fds.end())
{
// Initiate connection
cli->msgr.connect_peer(cur_list->osd_num, cli->st_cli.peer_states[cur_list->osd_num]);
return;
}
while (cur_list->in_flight < iodepth && cur_list->obj_pos < cur_list->obj_count)
{
osd_op_t *op = new osd_op_t();
op->op_type = OSD_OP_OUT;
op->peer_fd = cli->msgr.osd_peer_fds[cur_list->osd_num];
op->req = {
.rw = {
.header = {
.magic = SECONDARY_OSD_OP_MAGIC,
.id = cli->msgr.next_subop_id++,
.opcode = OSD_OP_DELETE,
},
.inode = cur_list->obj_list[cur_list->obj_pos].oid.inode,
.offset = (cur_list->obj_list[cur_list->obj_pos].oid.stripe & ~STRIPE_MASK),
.len = 0,
},
};
op->callback = [this, cur_list](osd_op_t *op)
{
cur_list->in_flight--;
if (op->reply.hdr.retval < 0)
{
fprintf(stderr, "Failed to remove object from PG %u (OSD %lu) (retval=%ld)\n",
cur_list->pg_num, cur_list->osd_num, op->reply.hdr.retval);
}
delete op;
cur_list->obj_done++;
total_done++;
continue_delete();
};
cli->msgr.outbox_push(op);
cur_list->obj_pos++;
cur_list->in_flight++;
}
if (!cur_list->in_flight && cur_list->obj_pos >= cur_list->obj_count)
{
free(cur_list->obj_list);
cur_list->obj_list = NULL;
cur_list->obj_count = 0;
cur_list->obj_done = cur_list->obj_prev_done = cur_list->obj_pos = 0;
cur_list->state = RM_END;
}
}
void continue_delete()
{
int par_osd = 0;
osd_num_t max_seen_osd = 0;
for (int i = 0; i < lists.size(); i++)
{
if (lists[i]->state == RM_END)
{
delete lists[i];
lists.erase(lists.begin()+i, lists.begin()+i+1);
i--;
}
else if (lists[i]->osd_num > max_seen_osd)
{
if (lists[i]->state == RM_NO_LIST)
{
send_list(lists[i]);
}
else if (lists[i]->state == RM_REMOVING)
{
send_ops(lists[i]);
}
par_osd++;
max_seen_osd = lists[i]->osd_num;
if (par_osd >= parallel_osds)
{
break;
}
}
}
if (progress && total_count > 0 && total_done*1000/total_count != total_prev_pct)
{
printf("\rRemoved %lu/%lu objects, %lu more PGs to list...", total_done, total_count, pgs_to_list);
total_prev_pct = total_done*1000/total_count;
}
if (!lists.size())
{
printf("Done, inode %lu in pool %lu removed\n", (inode & ((1l << (64-POOL_ID_BITS)) - 1)), pool_id);
exit(0);
}
}
};
int main(int narg, const char *args[])
{
setvbuf(stdout, NULL, _IONBF, 0);
setvbuf(stderr, NULL, _IONBF, 0);
exe_name = args[0];
rm_inode_t *p = new rm_inode_t();
p->run(rm_inode_t::parse_args(narg, args));
return 0;
}