Make rm_inode work with incomplete and degraded objects, allow to wait before deleting objects

Vitaliy Filippov 2020-12-28 02:07:53 +03:00
parent ccabbbfbcb
commit 4a17a61d1f
3 changed files with 148 additions and 62 deletions

View File

@ -399,10 +399,7 @@ and calculate disk offsets almost by hand. This will be fixed in near future.
- Object deletion requests may currently lead to 'incomplete' objects if your OSDs crash during - Object deletion requests may currently lead to 'incomplete' objects if your OSDs crash during
deletion because proper handling of object cleanup in a cluster should be "three-phase" deletion because proper handling of object cleanup in a cluster should be "three-phase"
and it's currently not implemented. Inode removal tool currently can't handle unclean and it's currently not implemented. Just to repeat the removal again in this case.
objects, so incomplete objects become undeletable. This will be fixed in near future
by allowing the inode removal tool to delete unclean objects. With this problem fixed
you'll be able just to repeat the removal again.
## Implementation Principles ## Implementation Principles

View File

@ -94,7 +94,7 @@ struct pg_t
std::vector<osd_num_t> cur_set; std::vector<osd_num_t> cur_set;
// same thing in state_dict-like format // same thing in state_dict-like format
pg_osd_set_t cur_loc_set; pg_osd_set_t cur_loc_set;
// moved object map. by default, each object is considered to reside on the cur_set. // moved object map. by default, each object is considered to reside on cur_set.
// this map stores all objects that differ. // this map stores all objects that differ.
// it may consume up to ~ (raw storage / object size) * 24 bytes in the worst case scenario // it may consume up to ~ (raw storage / object size) * 24 bytes in the worst case scenario
// which is up to ~192 MB per 1 TB in the worst case scenario // which is up to ~192 MB per 1 TB in the worst case scenario

View File

@ -6,26 +6,38 @@
* May be included into a bigger "command-line management interface" in the future * May be included into a bigger "command-line management interface" in the future
*/ */
#include <vector>
#include <algorithm> #include <algorithm>
#include "epoll_manager.h" #include "epoll_manager.h"
#include "cluster_client.h" #include "cluster_client.h"
#include "pg_states.h" #include "pg_states.h"
#define RM_NO_LIST 1 #define RM_LISTING 1
#define RM_LIST_SENT 2 #define RM_REMOVING 2
#define RM_REMOVING 3 #define RM_END 3
#define RM_END 4
const char *exe_name = NULL; const char *exe_name = NULL;
struct rm_pg_t;
struct rm_pg_osd_t struct rm_pg_osd_t
{ {
pg_num_t pg_num; rm_pg_t *pg = NULL;
osd_num_t osd_num; osd_num_t osd_num;
bool sent = false;
};
struct rm_pg_t
{
pg_num_t pg_num;
osd_num_t rm_osd_num;
std::vector<rm_pg_osd_t> list_osds;
int state = 0; int state = 0;
obj_ver_id *obj_list = NULL; int to_list;
uint64_t obj_count = 0, obj_pos = 0, obj_done = 0, obj_prev_done = 0; std::set<object_id> objects;
std::set<object_id>::iterator obj_pos;
uint64_t obj_count = 0, obj_done = 0, obj_prev_done = 0;
int in_flight = 0; int in_flight = 0;
}; };
@ -41,11 +53,12 @@ protected:
cluster_client_t *cli = NULL; cluster_client_t *cli = NULL;
ring_consumer_t consumer; ring_consumer_t consumer;
std::vector<rm_pg_osd_t*> lists; std::vector<rm_pg_t*> lists;
uint64_t total_count = 0, total_done = 0, total_prev_pct = 0; uint64_t total_count = 0, total_done = 0, total_prev_pct = 0;
uint64_t pgs_to_list = 0; uint64_t pgs_to_list = 0;
bool started = false; bool started = false;
bool progress = true; bool progress = true;
bool list_first = false;
int log_level = 0; int log_level = 0;
public: public:
@ -62,7 +75,7 @@ public:
else if (args[i][0] == '-' && args[i][1] == '-') else if (args[i][0] == '-' && args[i][1] == '-')
{ {
const char *opt = args[i]+2; const char *opt = args[i]+2;
cfg[opt] = !strcmp(opt, "json") || i == narg-1 ? "1" : args[++i]; cfg[opt] = !strcmp(opt, "json") || !strcmp(opt, "wait-list") || i == narg-1 ? "1" : args[++i];
} }
} }
return cfg; return cfg;
@ -74,7 +87,7 @@ public:
"Vitastor inode removal tool\n" "Vitastor inode removal tool\n"
"(c) Vitaliy Filippov, 2020 (VNPL-1.0)\n\n" "(c) Vitaliy Filippov, 2020 (VNPL-1.0)\n\n"
"USAGE:\n" "USAGE:\n"
" %s --etcd_address <etcd_address> --pool <pool> --inode <inode>\n", " %s --etcd_address <etcd_address> --pool <pool> --inode <inode> [--wait-list]\n",
exe_name exe_name
); );
exit(0); exit(0);
@ -105,6 +118,7 @@ public:
parallel_osds = 4; parallel_osds = 4;
log_level = cfg["log_level"].int64_value(); log_level = cfg["log_level"].int64_value();
progress = cfg["progress"].uint64_value() ? true : false; progress = cfg["progress"].uint64_value() ? true : false;
list_first = cfg["wait-list"].uint64_value() ? true : false;
// Create client // Create client
ringloop = new ring_loop_t(512); ringloop = new ring_loop_t(512);
epmgr = new epoll_manager_t(ringloop); epmgr = new epoll_manager_t(ringloop);
@ -137,21 +151,57 @@ public:
for (auto & pg_item: pool_cfg.pg_config) for (auto & pg_item: pool_cfg.pg_config)
{ {
auto & pg = pg_item.second; auto & pg = pg_item.second;
if (pg.pause || !pg.cur_primary || pg.cur_state != PG_ACTIVE) if (pg.pause || !pg.cur_primary || !(pg.cur_state & PG_ACTIVE))
{ {
// FIXME Support deletion in non-clean active PGs by introducing a "primary-list" command fprintf(stderr, "PG %u is inactive, skipping\n", pg_item.first);
fprintf(stderr, "PG %u is not active+clean, skipping\n", pg_item.first);
continue; continue;
} }
rm_pg_osd_t *r = new rm_pg_osd_t(); rm_pg_t *r = new rm_pg_t();
r->pg_num = pg_item.first; r->pg_num = pg_item.first;
r->osd_num = pg.cur_primary; r->rm_osd_num = pg.cur_primary;
r->state = RM_NO_LIST; r->state = RM_LISTING;
if (pg.cur_state != PG_ACTIVE)
{
std::set<osd_num_t> all_peers;
for (osd_num_t pg_osd: pg.target_set)
{
if (pg_osd != 0)
{
all_peers.insert(pg_osd);
}
}
for (osd_num_t pg_osd: pg.all_peers)
{
if (pg_osd != 0)
{
all_peers.insert(pg_osd);
}
}
for (auto & hist_item: pg.target_history)
{
for (auto pg_osd: hist_item)
{
if (pg_osd != 0)
{
all_peers.insert(pg_osd);
}
}
}
for (osd_num_t peer_osd: all_peers)
{
r->list_osds.push_back((rm_pg_osd_t){ .pg = r, .osd_num = peer_osd, .sent = false });
}
}
else
{
r->list_osds.push_back((rm_pg_osd_t){ .pg = r, .osd_num = pg.cur_primary, .sent = false });
}
r->to_list = r->list_osds.size();
lists.push_back(r); lists.push_back(r);
} }
std::sort(lists.begin(), lists.end(), [](rm_pg_osd_t *a, rm_pg_osd_t *b) std::sort(lists.begin(), lists.end(), [](rm_pg_t *a, rm_pg_t *b)
{ {
return a->osd_num < b->osd_num ? true : false; return a->rm_osd_num < b->rm_osd_num ? true : false;
}); });
pgs_to_list = lists.size(); pgs_to_list = lists.size();
started = true; started = true;
@ -160,6 +210,10 @@ public:
void send_list(rm_pg_osd_t *cur_list) void send_list(rm_pg_osd_t *cur_list)
{ {
if (cur_list->sent)
{
return;
}
if (cli->msgr.osd_peer_fds.find(cur_list->osd_num) == if (cli->msgr.osd_peer_fds.find(cur_list->osd_num) ==
cli->msgr.osd_peer_fds.end()) cli->msgr.osd_peer_fds.end())
{ {
@ -177,7 +231,7 @@ public:
.id = cli->msgr.next_subop_id++, .id = cli->msgr.next_subop_id++,
.opcode = OSD_OP_SEC_LIST, .opcode = OSD_OP_SEC_LIST,
}, },
.list_pg = cur_list->pg_num, .list_pg = cur_list->pg->pg_num,
.pg_count = (pg_num_t)cli->st_cli.pool_config[pool_id].real_pg_count, .pg_count = (pg_num_t)cli->st_cli.pool_config[pool_id].real_pg_count,
.pg_stripe_size = cli->st_cli.pool_config[pool_id].pg_stripe_size, .pg_stripe_size = cli->st_cli.pool_config[pool_id].pg_stripe_size,
.min_inode = inode, .min_inode = inode,
@ -186,53 +240,67 @@ public:
}; };
op->callback = [this, cur_list](osd_op_t *op) op->callback = [this, cur_list](osd_op_t *op)
{ {
pgs_to_list--; cur_list->pg->to_list--;
if (op->reply.hdr.retval < 0) if (op->reply.hdr.retval < 0)
{ {
fprintf(stderr, "Failed to get object list from OSD %lu (retval=%ld), skipping the PG\n", fprintf(stderr, "Failed to get PG %u/%u object list from OSD %lu (retval=%ld), skipping\n",
cur_list->osd_num, op->reply.hdr.retval); pool_id, cur_list->pg->pg_num, cur_list->osd_num, op->reply.hdr.retval);
cli->msgr.stop_client(cur_list->osd_num);
delete op;
cur_list->state = RM_END;
continue_delete();
return;
} }
if (log_level > 0) else
{ {
printf( if (op->reply.sec_list.stable_count < op->reply.hdr.retval)
"[PG %u/%u] Got inode object list from OSD %lu: %ld object versions\n", {
pool_id, cur_list->pg_num, cur_list->osd_num, op->reply.hdr.retval // Unstable objects, if present, mean that someone still writes into the inode. Warn the user about it.
); printf(
"[PG %u/%u] Inode still has %lu unstable object versions - is it still open? Not a good idea to delete it.\n",
pool_id, cur_list->pg->pg_num, op->reply.hdr.retval - op->reply.sec_list.stable_count
);
}
if (log_level > 0)
{
printf(
"[PG %u/%u] Got inode object list from OSD %lu: %ld object versions\n",
pool_id, cur_list->pg->pg_num, cur_list->osd_num, op->reply.hdr.retval
);
}
for (uint64_t i = 0; i < op->reply.hdr.retval; i++)
{
object_id oid = ((obj_ver_id*)op->buf)[i].oid;
oid.stripe = oid.stripe & ~STRIPE_MASK;
cur_list->pg->objects.insert(oid);
}
} }
cur_list->obj_list = (obj_ver_id*)op->buf;
cur_list->obj_count = (uint64_t)op->reply.hdr.retval;
cur_list->obj_done = cur_list->obj_prev_done = cur_list->obj_pos = 0;
total_count += cur_list->obj_count;
total_prev_pct = 0;
// set op->buf to NULL so it doesn't get freed
op->buf = NULL;
delete op; delete op;
cur_list->state = RM_REMOVING; if (cur_list->pg->to_list <= 0)
{
cur_list->pg->obj_done = cur_list->pg->obj_prev_done = 0;
cur_list->pg->obj_pos = cur_list->pg->objects.begin();
cur_list->pg->obj_count = cur_list->pg->objects.size();
total_count += cur_list->pg->obj_count;
total_prev_pct = 0;
cur_list->pg->state = RM_REMOVING;
pgs_to_list--;
}
continue_delete(); continue_delete();
}; };
cur_list->state = RM_LIST_SENT;
cli->msgr.outbox_push(op); cli->msgr.outbox_push(op);
cur_list->sent = true;
} }
void send_ops(rm_pg_osd_t *cur_list) void send_ops(rm_pg_t *cur_list)
{ {
if (cli->msgr.osd_peer_fds.find(cur_list->osd_num) == if (cli->msgr.osd_peer_fds.find(cur_list->rm_osd_num) ==
cli->msgr.osd_peer_fds.end()) cli->msgr.osd_peer_fds.end())
{ {
// Initiate connection // Initiate connection
cli->msgr.connect_peer(cur_list->osd_num, cli->st_cli.peer_states[cur_list->osd_num]); cli->msgr.connect_peer(cur_list->rm_osd_num, cli->st_cli.peer_states[cur_list->rm_osd_num]);
return; return;
} }
while (cur_list->in_flight < iodepth && cur_list->obj_pos < cur_list->obj_count) while (cur_list->in_flight < iodepth && cur_list->obj_pos != cur_list->objects.end())
{ {
osd_op_t *op = new osd_op_t(); osd_op_t *op = new osd_op_t();
op->op_type = OSD_OP_OUT; op->op_type = OSD_OP_OUT;
op->peer_fd = cli->msgr.osd_peer_fds[cur_list->osd_num]; op->peer_fd = cli->msgr.osd_peer_fds[cur_list->rm_osd_num];
op->req = (osd_any_op_t){ op->req = (osd_any_op_t){
.rw = { .rw = {
.header = { .header = {
@ -240,8 +308,8 @@ public:
.id = cli->msgr.next_subop_id++, .id = cli->msgr.next_subop_id++,
.opcode = OSD_OP_DELETE, .opcode = OSD_OP_DELETE,
}, },
.inode = cur_list->obj_list[cur_list->obj_pos].oid.inode, .inode = cur_list->obj_pos->inode,
.offset = (cur_list->obj_list[cur_list->obj_pos].oid.stripe & ~STRIPE_MASK), .offset = (cur_list->obj_pos->stripe & ~STRIPE_MASK),
.len = 0, .len = 0,
}, },
}; };
@ -251,7 +319,7 @@ public:
if (op->reply.hdr.retval < 0) if (op->reply.hdr.retval < 0)
{ {
fprintf(stderr, "Failed to remove object from PG %u (OSD %lu) (retval=%ld)\n", fprintf(stderr, "Failed to remove object from PG %u (OSD %lu) (retval=%ld)\n",
cur_list->pg_num, cur_list->osd_num, op->reply.hdr.retval); cur_list->pg_num, cur_list->rm_osd_num, op->reply.hdr.retval);
} }
delete op; delete op;
cur_list->obj_done++; cur_list->obj_done++;
@ -262,12 +330,10 @@ public:
cur_list->obj_pos++; cur_list->obj_pos++;
cur_list->in_flight++; cur_list->in_flight++;
} }
if (!cur_list->in_flight && cur_list->obj_pos >= cur_list->obj_count) if (!cur_list->in_flight && cur_list->obj_pos == cur_list->objects.end())
{ {
free(cur_list->obj_list);
cur_list->obj_list = NULL;
cur_list->obj_count = 0; cur_list->obj_count = 0;
cur_list->obj_done = cur_list->obj_prev_done = cur_list->obj_pos = 0; cur_list->obj_done = cur_list->obj_prev_done = 0;
cur_list->state = RM_END; cur_list->state = RM_END;
} }
} }
@ -276,6 +342,22 @@ public:
{ {
int par_osd = 0; int par_osd = 0;
osd_num_t max_seen_osd = 0; osd_num_t max_seen_osd = 0;
bool no_del = false;
if (list_first)
{
int i, n = 0;
for (i = 0; i < lists.size(); i++)
{
if (lists[i]->state == RM_LISTING)
{
n++;
}
}
if (n > 0)
{
no_del = true;
}
}
for (int i = 0; i < lists.size(); i++) for (int i = 0; i < lists.size(); i++)
{ {
if (lists[i]->state == RM_END) if (lists[i]->state == RM_END)
@ -284,18 +366,25 @@ public:
lists.erase(lists.begin()+i, lists.begin()+i+1); lists.erase(lists.begin()+i, lists.begin()+i+1);
i--; i--;
} }
else if (lists[i]->osd_num > max_seen_osd) else if (lists[i]->rm_osd_num > max_seen_osd)
{ {
if (lists[i]->state == RM_NO_LIST) if (lists[i]->state == RM_LISTING)
{ {
send_list(lists[i]); for (int j = 0; j < lists[i]->list_osds.size(); j++)
{
send_list(&lists[i]->list_osds[j]);
}
} }
else if (lists[i]->state == RM_REMOVING) else if (lists[i]->state == RM_REMOVING)
{ {
if (no_del)
{
continue;
}
send_ops(lists[i]); send_ops(lists[i]);
} }
par_osd++; par_osd++;
max_seen_osd = lists[i]->osd_num; max_seen_osd = lists[i]->rm_osd_num;
if (par_osd >= parallel_osds) if (par_osd >= parallel_osds)
{ {
break; break;