diff --git a/README.md b/README.md index 2629a6c2..6f733b1c 100644 --- a/README.md +++ b/README.md @@ -399,10 +399,7 @@ and calculate disk offsets almost by hand. This will be fixed in near future. - Object deletion requests may currently lead to 'incomplete' objects if your OSDs crash during deletion because proper handling of object cleanup in a cluster should be "three-phase" - and it's currently not implemented. Inode removal tool currently can't handle unclean - objects, so incomplete objects become undeletable. This will be fixed in near future - by allowing the inode removal tool to delete unclean objects. With this problem fixed - you'll be able just to repeat the removal again. + and it's currently not implemented. Just to repeat the removal again in this case. ## Implementation Principles diff --git a/osd_peering_pg.h b/osd_peering_pg.h index f1829d5f..bdea6958 100644 --- a/osd_peering_pg.h +++ b/osd_peering_pg.h @@ -94,7 +94,7 @@ struct pg_t std::vector cur_set; // same thing in state_dict-like format pg_osd_set_t cur_loc_set; - // moved object map. by default, each object is considered to reside on the cur_set. + // moved object map. by default, each object is considered to reside on cur_set. // this map stores all objects that differ. // it may consume up to ~ (raw storage / object size) * 24 bytes in the worst case scenario // which is up to ~192 MB per 1 TB in the worst case scenario diff --git a/rm_inode.cpp b/rm_inode.cpp index d872073c..e1ff14a3 100644 --- a/rm_inode.cpp +++ b/rm_inode.cpp @@ -6,26 +6,38 @@ * May be included into a bigger "command-line management interface" in the future */ +#include #include #include "epoll_manager.h" #include "cluster_client.h" #include "pg_states.h" -#define RM_NO_LIST 1 -#define RM_LIST_SENT 2 -#define RM_REMOVING 3 -#define RM_END 4 +#define RM_LISTING 1 +#define RM_REMOVING 2 +#define RM_END 3 const char *exe_name = NULL; +struct rm_pg_t; + struct rm_pg_osd_t { - pg_num_t pg_num; + rm_pg_t *pg = NULL; osd_num_t osd_num; + bool sent = false; +}; + +struct rm_pg_t +{ + pg_num_t pg_num; + osd_num_t rm_osd_num; + std::vector list_osds; int state = 0; - obj_ver_id *obj_list = NULL; - uint64_t obj_count = 0, obj_pos = 0, obj_done = 0, obj_prev_done = 0; + int to_list; + std::set objects; + std::set::iterator obj_pos; + uint64_t obj_count = 0, obj_done = 0, obj_prev_done = 0; int in_flight = 0; }; @@ -41,11 +53,12 @@ protected: cluster_client_t *cli = NULL; ring_consumer_t consumer; - std::vector lists; + std::vector lists; uint64_t total_count = 0, total_done = 0, total_prev_pct = 0; uint64_t pgs_to_list = 0; bool started = false; bool progress = true; + bool list_first = false; int log_level = 0; public: @@ -62,7 +75,7 @@ public: else if (args[i][0] == '-' && args[i][1] == '-') { const char *opt = args[i]+2; - cfg[opt] = !strcmp(opt, "json") || i == narg-1 ? "1" : args[++i]; + cfg[opt] = !strcmp(opt, "json") || !strcmp(opt, "wait-list") || i == narg-1 ? "1" : args[++i]; } } return cfg; @@ -74,7 +87,7 @@ public: "Vitastor inode removal tool\n" "(c) Vitaliy Filippov, 2020 (VNPL-1.0)\n\n" "USAGE:\n" - " %s --etcd_address --pool --inode \n", + " %s --etcd_address --pool --inode [--wait-list]\n", exe_name ); exit(0); @@ -105,6 +118,7 @@ public: parallel_osds = 4; log_level = cfg["log_level"].int64_value(); progress = cfg["progress"].uint64_value() ? true : false; + list_first = cfg["wait-list"].uint64_value() ? true : false; // Create client ringloop = new ring_loop_t(512); epmgr = new epoll_manager_t(ringloop); @@ -137,21 +151,57 @@ public: for (auto & pg_item: pool_cfg.pg_config) { auto & pg = pg_item.second; - if (pg.pause || !pg.cur_primary || pg.cur_state != PG_ACTIVE) + if (pg.pause || !pg.cur_primary || !(pg.cur_state & PG_ACTIVE)) { - // FIXME Support deletion in non-clean active PGs by introducing a "primary-list" command - fprintf(stderr, "PG %u is not active+clean, skipping\n", pg_item.first); + fprintf(stderr, "PG %u is inactive, skipping\n", pg_item.first); continue; } - rm_pg_osd_t *r = new rm_pg_osd_t(); + rm_pg_t *r = new rm_pg_t(); r->pg_num = pg_item.first; - r->osd_num = pg.cur_primary; - r->state = RM_NO_LIST; + r->rm_osd_num = pg.cur_primary; + r->state = RM_LISTING; + if (pg.cur_state != PG_ACTIVE) + { + std::set all_peers; + for (osd_num_t pg_osd: pg.target_set) + { + if (pg_osd != 0) + { + all_peers.insert(pg_osd); + } + } + for (osd_num_t pg_osd: pg.all_peers) + { + if (pg_osd != 0) + { + all_peers.insert(pg_osd); + } + } + for (auto & hist_item: pg.target_history) + { + for (auto pg_osd: hist_item) + { + if (pg_osd != 0) + { + all_peers.insert(pg_osd); + } + } + } + for (osd_num_t peer_osd: all_peers) + { + r->list_osds.push_back((rm_pg_osd_t){ .pg = r, .osd_num = peer_osd, .sent = false }); + } + } + else + { + r->list_osds.push_back((rm_pg_osd_t){ .pg = r, .osd_num = pg.cur_primary, .sent = false }); + } + r->to_list = r->list_osds.size(); lists.push_back(r); } - std::sort(lists.begin(), lists.end(), [](rm_pg_osd_t *a, rm_pg_osd_t *b) + std::sort(lists.begin(), lists.end(), [](rm_pg_t *a, rm_pg_t *b) { - return a->osd_num < b->osd_num ? true : false; + return a->rm_osd_num < b->rm_osd_num ? true : false; }); pgs_to_list = lists.size(); started = true; @@ -160,6 +210,10 @@ public: void send_list(rm_pg_osd_t *cur_list) { + if (cur_list->sent) + { + return; + } if (cli->msgr.osd_peer_fds.find(cur_list->osd_num) == cli->msgr.osd_peer_fds.end()) { @@ -177,7 +231,7 @@ public: .id = cli->msgr.next_subop_id++, .opcode = OSD_OP_SEC_LIST, }, - .list_pg = cur_list->pg_num, + .list_pg = cur_list->pg->pg_num, .pg_count = (pg_num_t)cli->st_cli.pool_config[pool_id].real_pg_count, .pg_stripe_size = cli->st_cli.pool_config[pool_id].pg_stripe_size, .min_inode = inode, @@ -186,53 +240,67 @@ public: }; op->callback = [this, cur_list](osd_op_t *op) { - pgs_to_list--; + cur_list->pg->to_list--; if (op->reply.hdr.retval < 0) { - fprintf(stderr, "Failed to get object list from OSD %lu (retval=%ld), skipping the PG\n", - cur_list->osd_num, op->reply.hdr.retval); - cli->msgr.stop_client(cur_list->osd_num); - delete op; - cur_list->state = RM_END; - continue_delete(); - return; + fprintf(stderr, "Failed to get PG %u/%u object list from OSD %lu (retval=%ld), skipping\n", + pool_id, cur_list->pg->pg_num, cur_list->osd_num, op->reply.hdr.retval); } - if (log_level > 0) + else { - printf( - "[PG %u/%u] Got inode object list from OSD %lu: %ld object versions\n", - pool_id, cur_list->pg_num, cur_list->osd_num, op->reply.hdr.retval - ); + if (op->reply.sec_list.stable_count < op->reply.hdr.retval) + { + // Unstable objects, if present, mean that someone still writes into the inode. Warn the user about it. + printf( + "[PG %u/%u] Inode still has %lu unstable object versions - is it still open? Not a good idea to delete it.\n", + pool_id, cur_list->pg->pg_num, op->reply.hdr.retval - op->reply.sec_list.stable_count + ); + } + if (log_level > 0) + { + printf( + "[PG %u/%u] Got inode object list from OSD %lu: %ld object versions\n", + pool_id, cur_list->pg->pg_num, cur_list->osd_num, op->reply.hdr.retval + ); + } + for (uint64_t i = 0; i < op->reply.hdr.retval; i++) + { + object_id oid = ((obj_ver_id*)op->buf)[i].oid; + oid.stripe = oid.stripe & ~STRIPE_MASK; + cur_list->pg->objects.insert(oid); + } } - cur_list->obj_list = (obj_ver_id*)op->buf; - cur_list->obj_count = (uint64_t)op->reply.hdr.retval; - cur_list->obj_done = cur_list->obj_prev_done = cur_list->obj_pos = 0; - total_count += cur_list->obj_count; - total_prev_pct = 0; - // set op->buf to NULL so it doesn't get freed - op->buf = NULL; delete op; - cur_list->state = RM_REMOVING; + if (cur_list->pg->to_list <= 0) + { + cur_list->pg->obj_done = cur_list->pg->obj_prev_done = 0; + cur_list->pg->obj_pos = cur_list->pg->objects.begin(); + cur_list->pg->obj_count = cur_list->pg->objects.size(); + total_count += cur_list->pg->obj_count; + total_prev_pct = 0; + cur_list->pg->state = RM_REMOVING; + pgs_to_list--; + } continue_delete(); }; - cur_list->state = RM_LIST_SENT; cli->msgr.outbox_push(op); + cur_list->sent = true; } - void send_ops(rm_pg_osd_t *cur_list) + void send_ops(rm_pg_t *cur_list) { - if (cli->msgr.osd_peer_fds.find(cur_list->osd_num) == + if (cli->msgr.osd_peer_fds.find(cur_list->rm_osd_num) == cli->msgr.osd_peer_fds.end()) { // Initiate connection - cli->msgr.connect_peer(cur_list->osd_num, cli->st_cli.peer_states[cur_list->osd_num]); + cli->msgr.connect_peer(cur_list->rm_osd_num, cli->st_cli.peer_states[cur_list->rm_osd_num]); return; } - while (cur_list->in_flight < iodepth && cur_list->obj_pos < cur_list->obj_count) + while (cur_list->in_flight < iodepth && cur_list->obj_pos != cur_list->objects.end()) { osd_op_t *op = new osd_op_t(); op->op_type = OSD_OP_OUT; - op->peer_fd = cli->msgr.osd_peer_fds[cur_list->osd_num]; + op->peer_fd = cli->msgr.osd_peer_fds[cur_list->rm_osd_num]; op->req = (osd_any_op_t){ .rw = { .header = { @@ -240,8 +308,8 @@ public: .id = cli->msgr.next_subop_id++, .opcode = OSD_OP_DELETE, }, - .inode = cur_list->obj_list[cur_list->obj_pos].oid.inode, - .offset = (cur_list->obj_list[cur_list->obj_pos].oid.stripe & ~STRIPE_MASK), + .inode = cur_list->obj_pos->inode, + .offset = (cur_list->obj_pos->stripe & ~STRIPE_MASK), .len = 0, }, }; @@ -251,7 +319,7 @@ public: if (op->reply.hdr.retval < 0) { fprintf(stderr, "Failed to remove object from PG %u (OSD %lu) (retval=%ld)\n", - cur_list->pg_num, cur_list->osd_num, op->reply.hdr.retval); + cur_list->pg_num, cur_list->rm_osd_num, op->reply.hdr.retval); } delete op; cur_list->obj_done++; @@ -262,12 +330,10 @@ public: cur_list->obj_pos++; cur_list->in_flight++; } - if (!cur_list->in_flight && cur_list->obj_pos >= cur_list->obj_count) + if (!cur_list->in_flight && cur_list->obj_pos == cur_list->objects.end()) { - free(cur_list->obj_list); - cur_list->obj_list = NULL; cur_list->obj_count = 0; - cur_list->obj_done = cur_list->obj_prev_done = cur_list->obj_pos = 0; + cur_list->obj_done = cur_list->obj_prev_done = 0; cur_list->state = RM_END; } } @@ -276,6 +342,22 @@ public: { int par_osd = 0; osd_num_t max_seen_osd = 0; + bool no_del = false; + if (list_first) + { + int i, n = 0; + for (i = 0; i < lists.size(); i++) + { + if (lists[i]->state == RM_LISTING) + { + n++; + } + } + if (n > 0) + { + no_del = true; + } + } for (int i = 0; i < lists.size(); i++) { if (lists[i]->state == RM_END) @@ -284,18 +366,25 @@ public: lists.erase(lists.begin()+i, lists.begin()+i+1); i--; } - else if (lists[i]->osd_num > max_seen_osd) + else if (lists[i]->rm_osd_num > max_seen_osd) { - if (lists[i]->state == RM_NO_LIST) + if (lists[i]->state == RM_LISTING) { - send_list(lists[i]); + for (int j = 0; j < lists[i]->list_osds.size(); j++) + { + send_list(&lists[i]->list_osds[j]); + } } else if (lists[i]->state == RM_REMOVING) { + if (no_del) + { + continue; + } send_ops(lists[i]); } par_osd++; - max_seen_osd = lists[i]->osd_num; + max_seen_osd = lists[i]->rm_osd_num; if (par_osd >= parallel_osds) { break;