Do not crash on PG re-peering events when operations are in progress

rel-0.5
Vitaliy Filippov 2021-04-07 01:29:30 +03:00
parent f6d705383a
commit 97efb9e299
11 changed files with 81 additions and 79 deletions

View File

@@ -183,7 +183,7 @@ const etcd_tree = {
/* <pool_id>: { /* <pool_id>: {
<pg_id>: { <pg_id>: {
primary: osd_num_t, primary: osd_num_t,
state: ("starting"|"peering"|"incomplete"|"active"|"stopping"|"offline"| state: ("starting"|"peering"|"incomplete"|"active"|"repeering"|"stopping"|"offline"|
"degraded"|"has_incomplete"|"has_degraded"|"has_misplaced"|"has_unclean"| "degraded"|"has_incomplete"|"has_degraded"|"has_misplaced"|"has_unclean"|
"has_invalid"|"left_on_dead")[], "has_invalid"|"left_on_dead")[],
} }

View File

@@ -558,7 +558,7 @@ void osd_t::apply_pg_config()
} }
if (currently_taken) if (currently_taken)
{ {
if (pg_it->second.state & (PG_ACTIVE | PG_INCOMPLETE | PG_PEERING)) if (pg_it->second.state & (PG_ACTIVE | PG_INCOMPLETE | PG_PEERING | PG_REPEERING))
{ {
if (pg_it->second.target_set == pg_cfg.target_set) if (pg_it->second.target_set == pg_cfg.target_set)
{ {

View File

@@ -149,10 +149,14 @@ void osd_t::handle_flush_op(bool rollback, pool_id_t pool_id, pg_num_t pg_num, p
{ {
continue_primary_write(op); continue_primary_write(op);
} }
if (pg.inflight == 0 && (pg.state & PG_STOPPING)) if ((pg.state & PG_STOPPING) && pg.inflight == 0 && !pg.flush_batch)
{ {
finish_stop_pg(pg); finish_stop_pg(pg);
} }
else if ((pg.state & PG_REPEERING) && pg.inflight == 0 && !pg.flush_batch)
{
start_pg_peering(pg);
}
} }
} }

View File

@@ -77,10 +77,11 @@ void osd_t::repeer_pgs(osd_num_t peer_osd)
// Re-peer affected PGs // Re-peer affected PGs
for (auto & p: pgs) for (auto & p: pgs)
{ {
auto & pg = p.second;
bool repeer = false; bool repeer = false;
if (p.second.state & (PG_PEERING | PG_ACTIVE | PG_INCOMPLETE)) if (pg.state & (PG_PEERING | PG_ACTIVE | PG_INCOMPLETE))
{ {
for (osd_num_t pg_osd: p.second.all_peers) for (osd_num_t pg_osd: pg.all_peers)
{ {
if (pg_osd == peer_osd) if (pg_osd == peer_osd)
{ {
@@ -91,8 +92,17 @@ void osd_t::repeer_pgs(osd_num_t peer_osd)
if (repeer) if (repeer)
{ {
// Repeer this pg // Repeer this pg
printf("[PG %u/%u] Repeer because of OSD %lu\n", p.second.pool_id, p.second.pg_num, peer_osd); printf("[PG %u/%u] Repeer because of OSD %lu\n", pg.pool_id, pg.pg_num, peer_osd);
start_pg_peering(p.second); if (!(pg.state & (PG_ACTIVE | PG_REPEERING)) || pg.inflight == 0 && !pg.flush_batch)
{
start_pg_peering(pg);
}
else
{
// Stop accepting new operations, wait for current ones to finish or fail
pg.state = pg.state & ~PG_ACTIVE | PG_REPEERING;
report_pg_state(pg);
}
} }
} }
} }
@@ -334,9 +344,10 @@ void osd_t::submit_sync_and_list_subop(osd_num_t role_osd, pg_peering_state_t *p
{ {
// FIXME: Mark peer as failed and don't reconnect immediately after dropping the connection // FIXME: Mark peer as failed and don't reconnect immediately after dropping the connection
printf("Failed to sync OSD %lu: %ld (%s), disconnecting peer\n", role_osd, op->reply.hdr.retval, strerror(-op->reply.hdr.retval)); printf("Failed to sync OSD %lu: %ld (%s), disconnecting peer\n", role_osd, op->reply.hdr.retval, strerror(-op->reply.hdr.retval));
int fail_fd = op->peer_fd;
ps->list_ops.erase(role_osd); ps->list_ops.erase(role_osd);
c_cli.stop_client(op->peer_fd);
delete op; delete op;
c_cli.stop_client(fail_fd);
return; return;
} }
delete op; delete op;
@@ -413,9 +424,10 @@ void osd_t::submit_list_subop(osd_num_t role_osd, pg_peering_state_t *ps)
if (op->reply.hdr.retval < 0) if (op->reply.hdr.retval < 0)
{ {
printf("Failed to get object list from OSD %lu (retval=%ld), disconnecting peer\n", role_osd, op->reply.hdr.retval); printf("Failed to get object list from OSD %lu (retval=%ld), disconnecting peer\n", role_osd, op->reply.hdr.retval);
int fail_fd = op->peer_fd;
ps->list_ops.erase(role_osd); ps->list_ops.erase(role_osd);
c_cli.stop_client(op->peer_fd);
delete op; delete op;
c_cli.stop_client(fail_fd);
return; return;
} }
printf( printf(
@@ -484,15 +496,13 @@ bool osd_t::stop_pg(pg_t & pg)
{ {
return false; return false;
} }
if (!(pg.state & PG_ACTIVE)) if (!(pg.state & (PG_ACTIVE | PG_REPEERING)))
{ {
finish_stop_pg(pg); finish_stop_pg(pg);
return true; return true;
} }
pg.state = pg.state & ~PG_ACTIVE | PG_STOPPING; pg.state = pg.state & ~PG_ACTIVE & ~PG_REPEERING | PG_STOPPING;
if (pg.inflight == 0 && !pg.flush_batch && if (pg.inflight == 0 && !pg.flush_batch)
// We must either forget all PG's unstable writes or wait for it to become clean
dirty_pgs.find({ .pool_id = pg.pool_id, .pg_num = pg.pg_num }) == dirty_pgs.end())
{ {
finish_stop_pg(pg); finish_stop_pg(pg);
} }

View File

@@ -430,12 +430,13 @@ void pg_t::calc_object_states(int log_level)
void pg_t::print_state() void pg_t::print_state()
{ {
printf( printf(
"[PG %u/%u] is %s%s%s%s%s%s%s%s%s%s%s%s%s (%lu objects)\n", pool_id, pg_num, "[PG %u/%u] is %s%s%s%s%s%s%s%s%s%s%s%s%s%s (%lu objects)\n", pool_id, pg_num,
(state & PG_STARTING) ? "starting" : "", (state & PG_STARTING) ? "starting" : "",
(state & PG_OFFLINE) ? "offline" : "", (state & PG_OFFLINE) ? "offline" : "",
(state & PG_PEERING) ? "peering" : "", (state & PG_PEERING) ? "peering" : "",
(state & PG_INCOMPLETE) ? "incomplete" : "", (state & PG_INCOMPLETE) ? "incomplete" : "",
(state & PG_ACTIVE) ? "active" : "", (state & PG_ACTIVE) ? "active" : "",
(state & PG_REPEERING) ? "repeering" : "",
(state & PG_STOPPING) ? "stopping" : "", (state & PG_STOPPING) ? "stopping" : "",
(state & PG_DEGRADED) ? " + degraded" : "", (state & PG_DEGRADED) ? " + degraded" : "",
(state & PG_HAS_INCOMPLETE) ? " + has_incomplete" : "", (state & PG_HAS_INCOMPLETE) ? " + has_incomplete" : "",

View File

@@ -291,20 +291,18 @@ resume_5:
free_object_state(pg, &op_data->object_state); free_object_state(pg, &op_data->object_state);
} }
pg.total_count--; pg.total_count--;
object_id oid = op_data->oid; osd_op_t *next_op = NULL;
finish_op(cur_op, cur_op->req.rw.len); auto next_it = pg.write_queue.find(op_data->oid);
// Continue other write operations to the same object if (next_it != pg.write_queue.end() && next_it->second == cur_op)
auto next_it = pg.write_queue.find(oid);
auto this_it = next_it;
if (this_it != pg.write_queue.end() && this_it->second == cur_op)
{ {
next_it++; pg.write_queue.erase(next_it++);
pg.write_queue.erase(this_it); if (next_it != pg.write_queue.end() && next_it->first == op_data->oid)
if (next_it != pg.write_queue.end() && next_op = next_it->second;
next_it->first == oid) }
{ finish_op(cur_op, cur_op->req.rw.len);
osd_op_t *next_op = next_it->second; if (next_op)
continue_primary_write(next_op); {
} // Continue next write to the same object
continue_primary_write(next_op);
} }
} }

View File

@@ -43,12 +43,14 @@ void osd_t::finish_op(osd_op_t *cur_op, int retval)
auto & pg = pgs.at({ .pool_id = INODE_POOL(cur_op->op_data->oid.inode), .pg_num = cur_op->op_data->pg_num }); auto & pg = pgs.at({ .pool_id = INODE_POOL(cur_op->op_data->oid.inode), .pg_num = cur_op->op_data->pg_num });
pg.inflight--; pg.inflight--;
assert(pg.inflight >= 0); assert(pg.inflight >= 0);
if ((pg.state & PG_STOPPING) && pg.inflight == 0 && !pg.flush_batch && if ((pg.state & PG_STOPPING) && pg.inflight == 0 && !pg.flush_batch)
// We must either forget all PG's unstable writes or wait for it to become clean
dirty_pgs.find({ .pool_id = pg.pool_id, .pg_num = pg.pg_num }) == dirty_pgs.end())
{ {
finish_stop_pg(pg); finish_stop_pg(pg);
} }
else if ((pg.state & PG_REPEERING) && pg.inflight == 0 && !pg.flush_batch)
{
start_pg_peering(pg);
}
} }
assert(!cur_op->op_data->subops); assert(!cur_op->op_data->subops);
assert(!cur_op->op_data->unstable_write_osds); assert(!cur_op->op_data->unstable_write_osds);
@@ -194,14 +196,7 @@ void osd_t::submit_primary_subops(int submit_type, uint64_t op_version, int pg_s
} }
subops[i].callback = [cur_op, this](osd_op_t *subop) subops[i].callback = [cur_op, this](osd_op_t *subop)
{ {
int fail_fd = subop->req.hdr.opcode == OSD_OP_SEC_WRITE &&
subop->reply.hdr.retval != subop->req.sec_rw.len ? subop->peer_fd : -1;
handle_primary_subop(subop, cur_op); handle_primary_subop(subop, cur_op);
if (fail_fd >= 0)
{
// write operation failed, drop the connection
c_cli.stop_client(fail_fd);
}
}; };
c_cli.outbox_push(&subops[i]); c_cli.outbox_push(&subops[i]);
} }
@@ -247,6 +242,7 @@ void osd_t::handle_primary_bs_subop(osd_op_t *subop)
} }
delete bs_op; delete bs_op;
subop->bs_op = NULL; subop->bs_op = NULL;
subop->peer_fd = -1;
handle_primary_subop(subop, cur_op); handle_primary_subop(subop, cur_op);
} }
@@ -288,6 +284,11 @@ void osd_t::handle_primary_subop(osd_op_t *subop, osd_op_t *cur_op)
op_data->epipe++; op_data->epipe++;
} }
op_data->errors++; op_data->errors++;
if (subop->peer_fd >= 0)
{
// Drop connection on any error
c_cli.stop_client(subop->peer_fd);
}
} }
else else
{ {
@@ -438,13 +439,7 @@ void osd_t::submit_primary_del_batch(osd_op_t *cur_op, obj_ver_osd_t *chunks_to_
} }; } };
subops[i].callback = [cur_op, this](osd_op_t *subop) subops[i].callback = [cur_op, this](osd_op_t *subop)
{ {
int fail_fd = subop->reply.hdr.retval != 0 ? subop->peer_fd : -1;
handle_primary_subop(subop, cur_op); handle_primary_subop(subop, cur_op);
if (fail_fd >= 0)
{
// delete operation failed, drop the connection
c_cli.stop_client(fail_fd);
}
}; };
c_cli.outbox_push(&subops[i]); c_cli.outbox_push(&subops[i]);
} }
@@ -489,13 +484,7 @@ int osd_t::submit_primary_sync_subops(osd_op_t *cur_op)
} }; } };
subops[i].callback = [cur_op, this](osd_op_t *subop) subops[i].callback = [cur_op, this](osd_op_t *subop)
{ {
int fail_fd = subop->reply.hdr.retval != 0 ? subop->peer_fd : -1;
handle_primary_subop(subop, cur_op); handle_primary_subop(subop, cur_op);
if (fail_fd >= 0)
{
// sync operation failed, drop the connection
c_cli.stop_client(fail_fd);
}
}; };
c_cli.outbox_push(&subops[i]); c_cli.outbox_push(&subops[i]);
} }
@@ -554,13 +543,7 @@ void osd_t::submit_primary_stab_subops(osd_op_t *cur_op)
subops[i].iov.push_back(op_data->unstable_writes + stab_osd.start, stab_osd.len * sizeof(obj_ver_id)); subops[i].iov.push_back(op_data->unstable_writes + stab_osd.start, stab_osd.len * sizeof(obj_ver_id));
subops[i].callback = [cur_op, this](osd_op_t *subop) subops[i].callback = [cur_op, this](osd_op_t *subop)
{ {
int fail_fd = subop->reply.hdr.retval != 0 ? subop->peer_fd : -1;
handle_primary_subop(subop, cur_op); handle_primary_subop(subop, cur_op);
if (fail_fd >= 0)
{
// sync operation failed, drop the connection
c_cli.stop_client(fail_fd);
}
}; };
c_cli.outbox_push(&subops[i]); c_cli.outbox_push(&subops[i]);
} }
@@ -578,7 +561,7 @@ void osd_t::pg_cancel_write_queue(pg_t & pg, osd_op_t *first_op, object_id oid,
return; return;
} }
std::vector<osd_op_t*> cancel_ops; std::vector<osd_op_t*> cancel_ops;
while (it != pg.write_queue.end()) while (it != pg.write_queue.end() && it->first == oid)
{ {
cancel_ops.push_back(it->second); cancel_ops.push_back(it->second);
it++; it++;

View File

@@ -218,12 +218,14 @@ resume_8:
{ {
auto & pg = pgs.at(op_data->dirty_pgs[i]); auto & pg = pgs.at(op_data->dirty_pgs[i]);
pg.inflight--; pg.inflight--;
if ((pg.state & PG_STOPPING) && pg.inflight == 0 && !pg.flush_batch && if ((pg.state & PG_STOPPING) && pg.inflight == 0 && !pg.flush_batch)
// We must either forget all PG's unstable writes or wait for it to become clean
dirty_pgs.find({ .pool_id = pg.pool_id, .pg_num = pg.pg_num }) == dirty_pgs.end())
{ {
finish_stop_pg(pg); finish_stop_pg(pg);
} }
else if ((pg.state & PG_REPEERING) && pg.inflight == 0 && !pg.flush_batch)
{
start_pg_peering(pg);
}
} }
// FIXME: Free those in the destructor? // FIXME: Free those in the destructor?
free(op_data->dirty_pgs); free(op_data->dirty_pgs);

View File

@@ -252,19 +252,20 @@ resume_9:
} }
cur_op->reply.hdr.retval = cur_op->req.rw.len; cur_op->reply.hdr.retval = cur_op->req.rw.len;
continue_others: continue_others:
object_id oid = op_data->oid; osd_op_t *next_op = NULL;
auto next_it = pg.write_queue.find(op_data->oid);
// Remove the operation from queue before calling finish_op so it doesn't see the completed operation in queue // Remove the operation from queue before calling finish_op so it doesn't see the completed operation in queue
auto next_it = pg.write_queue.find(oid);
if (next_it != pg.write_queue.end() && next_it->second == cur_op) if (next_it != pg.write_queue.end() && next_it->second == cur_op)
{ {
pg.write_queue.erase(next_it++); pg.write_queue.erase(next_it++);
if (next_it != pg.write_queue.end() && next_it->first == op_data->oid)
next_op = next_it->second;
} }
// finish_op would invalidate next_it if it cleared pg.write_queue, but it doesn't do that :) // finish_op would invalidate next_it if it cleared pg.write_queue, but it doesn't do that :)
finish_op(cur_op, cur_op->reply.hdr.retval); finish_op(cur_op, cur_op->req.rw.len);
// Continue other write operations to the same object if (next_op)
if (next_it != pg.write_queue.end() && next_it->first == oid)
{ {
osd_op_t *next_op = next_it->second; // Continue next write to the same object
continue_primary_write(next_op); continue_primary_write(next_op);
} }
} }

View File

@@ -3,13 +3,14 @@
#include "pg_states.h" #include "pg_states.h"
const int pg_state_bit_count = 14; const int pg_state_bit_count = 15;
const int pg_state_bits[14] = { const int pg_state_bits[15] = {
PG_STARTING, PG_STARTING,
PG_PEERING, PG_PEERING,
PG_INCOMPLETE, PG_INCOMPLETE,
PG_ACTIVE, PG_ACTIVE,
PG_REPEERING,
PG_STOPPING, PG_STOPPING,
PG_OFFLINE, PG_OFFLINE,
PG_DEGRADED, PG_DEGRADED,
@@ -21,11 +22,12 @@ const int pg_state_bits[14] = {
PG_LEFT_ON_DEAD, PG_LEFT_ON_DEAD,
}; };
const char *pg_state_names[14] = { const char *pg_state_names[15] = {
"starting", "starting",
"peering", "peering",
"incomplete", "incomplete",
"active", "active",
"repeering",
"stopping", "stopping",
"offline", "offline",
"degraded", "degraded",

View File

@@ -10,16 +10,17 @@
#define PG_PEERING (1<<1) #define PG_PEERING (1<<1)
#define PG_INCOMPLETE (1<<2) #define PG_INCOMPLETE (1<<2)
#define PG_ACTIVE (1<<3) #define PG_ACTIVE (1<<3)
#define PG_STOPPING (1<<4) #define PG_REPEERING (1<<4)
#define PG_OFFLINE (1<<5) #define PG_STOPPING (1<<5)
#define PG_OFFLINE (1<<6)
// Plus any of these: // Plus any of these:
#define PG_DEGRADED (1<<6) #define PG_DEGRADED (1<<7)
#define PG_HAS_INCOMPLETE (1<<7) #define PG_HAS_INCOMPLETE (1<<8)
#define PG_HAS_DEGRADED (1<<8) #define PG_HAS_DEGRADED (1<<9)
#define PG_HAS_MISPLACED (1<<9) #define PG_HAS_MISPLACED (1<<10)
#define PG_HAS_UNCLEAN (1<<10) #define PG_HAS_UNCLEAN (1<<11)
#define PG_HAS_INVALID (1<<11) #define PG_HAS_INVALID (1<<12)
#define PG_LEFT_ON_DEAD (1<<12) #define PG_LEFT_ON_DEAD (1<<13)
// Lower bits that represent object role (EC 0/1/2... or always 0 with replication) // Lower bits that represent object role (EC 0/1/2... or always 0 with replication)
// 12 bits is a safe default that doesn't depend on pg_stripe_size or pg_block_size // 12 bits is a safe default that doesn't depend on pg_stripe_size or pg_block_size