Do not crash on PG re-peering events when operations are in progress

rel-0.5
Vitaliy Filippov 2021-04-07 01:29:30 +03:00
commit 97efb9e299
11 arquivos alterados com 81 adições e 79 exclusões

Ver arquivo

@ -183,7 +183,7 @@ const etcd_tree = {
/* <pool_id>: {
<pg_id>: {
primary: osd_num_t,
state: ("starting"|"peering"|"incomplete"|"active"|"stopping"|"offline"|
state: ("starting"|"peering"|"incomplete"|"active"|"repeering"|"stopping"|"offline"|
"degraded"|"has_incomplete"|"has_degraded"|"has_misplaced"|"has_unclean"|
"has_invalid"|"left_on_dead")[],
}

Ver arquivo

@ -558,7 +558,7 @@ void osd_t::apply_pg_config()
}
if (currently_taken)
{
if (pg_it->second.state & (PG_ACTIVE | PG_INCOMPLETE | PG_PEERING))
if (pg_it->second.state & (PG_ACTIVE | PG_INCOMPLETE | PG_PEERING | PG_REPEERING))
{
if (pg_it->second.target_set == pg_cfg.target_set)
{

Ver arquivo

@ -149,10 +149,14 @@ void osd_t::handle_flush_op(bool rollback, pool_id_t pool_id, pg_num_t pg_num, p
{
continue_primary_write(op);
}
if (pg.inflight == 0 && (pg.state & PG_STOPPING))
if ((pg.state & PG_STOPPING) && pg.inflight == 0 && !pg.flush_batch)
{
finish_stop_pg(pg);
}
else if ((pg.state & PG_REPEERING) && pg.inflight == 0 && !pg.flush_batch)
{
start_pg_peering(pg);
}
}
}

Ver arquivo

@ -77,10 +77,11 @@ void osd_t::repeer_pgs(osd_num_t peer_osd)
// Re-peer affected PGs
for (auto & p: pgs)
{
auto & pg = p.second;
bool repeer = false;
if (p.second.state & (PG_PEERING | PG_ACTIVE | PG_INCOMPLETE))
if (pg.state & (PG_PEERING | PG_ACTIVE | PG_INCOMPLETE))
{
for (osd_num_t pg_osd: p.second.all_peers)
for (osd_num_t pg_osd: pg.all_peers)
{
if (pg_osd == peer_osd)
{
@ -91,8 +92,17 @@ void osd_t::repeer_pgs(osd_num_t peer_osd)
if (repeer)
{
// Repeer this pg
printf("[PG %u/%u] Repeer because of OSD %lu\n", p.second.pool_id, p.second.pg_num, peer_osd);
start_pg_peering(p.second);
printf("[PG %u/%u] Repeer because of OSD %lu\n", pg.pool_id, pg.pg_num, peer_osd);
if (!(pg.state & (PG_ACTIVE | PG_REPEERING)) || pg.inflight == 0 && !pg.flush_batch)
{
start_pg_peering(pg);
}
else
{
// Stop accepting new operations, wait for current ones to finish or fail
pg.state = pg.state & ~PG_ACTIVE | PG_REPEERING;
report_pg_state(pg);
}
}
}
}
@ -334,9 +344,10 @@ void osd_t::submit_sync_and_list_subop(osd_num_t role_osd, pg_peering_state_t *p
{
// FIXME: Mark peer as failed and don't reconnect immediately after dropping the connection
printf("Failed to sync OSD %lu: %ld (%s), disconnecting peer\n", role_osd, op->reply.hdr.retval, strerror(-op->reply.hdr.retval));
int fail_fd = op->peer_fd;
ps->list_ops.erase(role_osd);
c_cli.stop_client(op->peer_fd);
delete op;
c_cli.stop_client(fail_fd);
return;
}
delete op;
@ -413,9 +424,10 @@ void osd_t::submit_list_subop(osd_num_t role_osd, pg_peering_state_t *ps)
if (op->reply.hdr.retval < 0)
{
printf("Failed to get object list from OSD %lu (retval=%ld), disconnecting peer\n", role_osd, op->reply.hdr.retval);
int fail_fd = op->peer_fd;
ps->list_ops.erase(role_osd);
c_cli.stop_client(op->peer_fd);
delete op;
c_cli.stop_client(fail_fd);
return;
}
printf(
@ -484,15 +496,13 @@ bool osd_t::stop_pg(pg_t & pg)
{
return false;
}
if (!(pg.state & PG_ACTIVE))
if (!(pg.state & (PG_ACTIVE | PG_REPEERING)))
{
finish_stop_pg(pg);
return true;
}
pg.state = pg.state & ~PG_ACTIVE | PG_STOPPING;
if (pg.inflight == 0 && !pg.flush_batch &&
// We must either forget all PG's unstable writes or wait for it to become clean
dirty_pgs.find({ .pool_id = pg.pool_id, .pg_num = pg.pg_num }) == dirty_pgs.end())
pg.state = pg.state & ~PG_ACTIVE & ~PG_REPEERING | PG_STOPPING;
if (pg.inflight == 0 && !pg.flush_batch)
{
finish_stop_pg(pg);
}

Ver arquivo

@ -430,12 +430,13 @@ void pg_t::calc_object_states(int log_level)
void pg_t::print_state()
{
printf(
"[PG %u/%u] is %s%s%s%s%s%s%s%s%s%s%s%s%s (%lu objects)\n", pool_id, pg_num,
"[PG %u/%u] is %s%s%s%s%s%s%s%s%s%s%s%s%s%s (%lu objects)\n", pool_id, pg_num,
(state & PG_STARTING) ? "starting" : "",
(state & PG_OFFLINE) ? "offline" : "",
(state & PG_PEERING) ? "peering" : "",
(state & PG_INCOMPLETE) ? "incomplete" : "",
(state & PG_ACTIVE) ? "active" : "",
(state & PG_REPEERING) ? "repeering" : "",
(state & PG_STOPPING) ? "stopping" : "",
(state & PG_DEGRADED) ? " + degraded" : "",
(state & PG_HAS_INCOMPLETE) ? " + has_incomplete" : "",

Ver arquivo

@ -291,20 +291,18 @@ resume_5:
free_object_state(pg, &op_data->object_state);
}
pg.total_count--;
object_id oid = op_data->oid;
finish_op(cur_op, cur_op->req.rw.len);
// Continue other write operations to the same object
auto next_it = pg.write_queue.find(oid);
auto this_it = next_it;
if (this_it != pg.write_queue.end() && this_it->second == cur_op)
osd_op_t *next_op = NULL;
auto next_it = pg.write_queue.find(op_data->oid);
if (next_it != pg.write_queue.end() && next_it->second == cur_op)
{
next_it++;
pg.write_queue.erase(this_it);
if (next_it != pg.write_queue.end() &&
next_it->first == oid)
{
osd_op_t *next_op = next_it->second;
continue_primary_write(next_op);
}
pg.write_queue.erase(next_it++);
if (next_it != pg.write_queue.end() && next_it->first == op_data->oid)
next_op = next_it->second;
}
finish_op(cur_op, cur_op->req.rw.len);
if (next_op)
{
// Continue next write to the same object
continue_primary_write(next_op);
}
}

Ver arquivo

@ -43,12 +43,14 @@ void osd_t::finish_op(osd_op_t *cur_op, int retval)
auto & pg = pgs.at({ .pool_id = INODE_POOL(cur_op->op_data->oid.inode), .pg_num = cur_op->op_data->pg_num });
pg.inflight--;
assert(pg.inflight >= 0);
if ((pg.state & PG_STOPPING) && pg.inflight == 0 && !pg.flush_batch &&
// We must either forget all PG's unstable writes or wait for it to become clean
dirty_pgs.find({ .pool_id = pg.pool_id, .pg_num = pg.pg_num }) == dirty_pgs.end())
if ((pg.state & PG_STOPPING) && pg.inflight == 0 && !pg.flush_batch)
{
finish_stop_pg(pg);
}
else if ((pg.state & PG_REPEERING) && pg.inflight == 0 && !pg.flush_batch)
{
start_pg_peering(pg);
}
}
assert(!cur_op->op_data->subops);
assert(!cur_op->op_data->unstable_write_osds);
@ -194,14 +196,7 @@ void osd_t::submit_primary_subops(int submit_type, uint64_t op_version, int pg_s
}
subops[i].callback = [cur_op, this](osd_op_t *subop)
{
int fail_fd = subop->req.hdr.opcode == OSD_OP_SEC_WRITE &&
subop->reply.hdr.retval != subop->req.sec_rw.len ? subop->peer_fd : -1;
handle_primary_subop(subop, cur_op);
if (fail_fd >= 0)
{
// write operation failed, drop the connection
c_cli.stop_client(fail_fd);
}
};
c_cli.outbox_push(&subops[i]);
}
@ -247,6 +242,7 @@ void osd_t::handle_primary_bs_subop(osd_op_t *subop)
}
delete bs_op;
subop->bs_op = NULL;
subop->peer_fd = -1;
handle_primary_subop(subop, cur_op);
}
@ -288,6 +284,11 @@ void osd_t::handle_primary_subop(osd_op_t *subop, osd_op_t *cur_op)
op_data->epipe++;
}
op_data->errors++;
if (subop->peer_fd >= 0)
{
// Drop connection on any error
c_cli.stop_client(subop->peer_fd);
}
}
else
{
@ -438,13 +439,7 @@ void osd_t::submit_primary_del_batch(osd_op_t *cur_op, obj_ver_osd_t *chunks_to_
} };
subops[i].callback = [cur_op, this](osd_op_t *subop)
{
int fail_fd = subop->reply.hdr.retval != 0 ? subop->peer_fd : -1;
handle_primary_subop(subop, cur_op);
if (fail_fd >= 0)
{
// delete operation failed, drop the connection
c_cli.stop_client(fail_fd);
}
};
c_cli.outbox_push(&subops[i]);
}
@ -489,13 +484,7 @@ int osd_t::submit_primary_sync_subops(osd_op_t *cur_op)
} };
subops[i].callback = [cur_op, this](osd_op_t *subop)
{
int fail_fd = subop->reply.hdr.retval != 0 ? subop->peer_fd : -1;
handle_primary_subop(subop, cur_op);
if (fail_fd >= 0)
{
// sync operation failed, drop the connection
c_cli.stop_client(fail_fd);
}
};
c_cli.outbox_push(&subops[i]);
}
@ -554,13 +543,7 @@ void osd_t::submit_primary_stab_subops(osd_op_t *cur_op)
subops[i].iov.push_back(op_data->unstable_writes + stab_osd.start, stab_osd.len * sizeof(obj_ver_id));
subops[i].callback = [cur_op, this](osd_op_t *subop)
{
int fail_fd = subop->reply.hdr.retval != 0 ? subop->peer_fd : -1;
handle_primary_subop(subop, cur_op);
if (fail_fd >= 0)
{
// sync operation failed, drop the connection
c_cli.stop_client(fail_fd);
}
};
c_cli.outbox_push(&subops[i]);
}
@ -578,7 +561,7 @@ void osd_t::pg_cancel_write_queue(pg_t & pg, osd_op_t *first_op, object_id oid,
return;
}
std::vector<osd_op_t*> cancel_ops;
while (it != pg.write_queue.end())
while (it != pg.write_queue.end() && it->first == oid)
{
cancel_ops.push_back(it->second);
it++;

Ver arquivo

@ -218,12 +218,14 @@ resume_8:
{
auto & pg = pgs.at(op_data->dirty_pgs[i]);
pg.inflight--;
if ((pg.state & PG_STOPPING) && pg.inflight == 0 && !pg.flush_batch &&
// We must either forget all PG's unstable writes or wait for it to become clean
dirty_pgs.find({ .pool_id = pg.pool_id, .pg_num = pg.pg_num }) == dirty_pgs.end())
if ((pg.state & PG_STOPPING) && pg.inflight == 0 && !pg.flush_batch)
{
finish_stop_pg(pg);
}
else if ((pg.state & PG_REPEERING) && pg.inflight == 0 && !pg.flush_batch)
{
start_pg_peering(pg);
}
}
// FIXME: Free those in the destructor?
free(op_data->dirty_pgs);

Ver arquivo

@ -252,19 +252,20 @@ resume_9:
}
cur_op->reply.hdr.retval = cur_op->req.rw.len;
continue_others:
object_id oid = op_data->oid;
osd_op_t *next_op = NULL;
auto next_it = pg.write_queue.find(op_data->oid);
// Remove the operation from queue before calling finish_op so it doesn't see the completed operation in queue
auto next_it = pg.write_queue.find(oid);
if (next_it != pg.write_queue.end() && next_it->second == cur_op)
{
pg.write_queue.erase(next_it++);
if (next_it != pg.write_queue.end() && next_it->first == op_data->oid)
next_op = next_it->second;
}
// finish_op would invalidate next_it if it cleared pg.write_queue, but it doesn't do that :)
finish_op(cur_op, cur_op->reply.hdr.retval);
// Continue other write operations to the same object
if (next_it != pg.write_queue.end() && next_it->first == oid)
finish_op(cur_op, cur_op->req.rw.len);
if (next_op)
{
osd_op_t *next_op = next_it->second;
// Continue next write to the same object
continue_primary_write(next_op);
}
}

Ver arquivo

@ -3,13 +3,14 @@
#include "pg_states.h"
const int pg_state_bit_count = 14;
const int pg_state_bit_count = 15;
const int pg_state_bits[14] = {
const int pg_state_bits[15] = {
PG_STARTING,
PG_PEERING,
PG_INCOMPLETE,
PG_ACTIVE,
PG_REPEERING,
PG_STOPPING,
PG_OFFLINE,
PG_DEGRADED,
@ -21,11 +22,12 @@ const int pg_state_bits[14] = {
PG_LEFT_ON_DEAD,
};
const char *pg_state_names[14] = {
const char *pg_state_names[15] = {
"starting",
"peering",
"incomplete",
"active",
"repeering",
"stopping",
"offline",
"degraded",

Ver arquivo

@ -10,16 +10,17 @@
#define PG_PEERING (1<<1)
#define PG_INCOMPLETE (1<<2)
#define PG_ACTIVE (1<<3)
#define PG_STOPPING (1<<4)
#define PG_OFFLINE (1<<5)
#define PG_REPEERING (1<<4)
#define PG_STOPPING (1<<5)
#define PG_OFFLINE (1<<6)
// Plus any of these:
#define PG_DEGRADED (1<<6)
#define PG_HAS_INCOMPLETE (1<<7)
#define PG_HAS_DEGRADED (1<<8)
#define PG_HAS_MISPLACED (1<<9)
#define PG_HAS_UNCLEAN (1<<10)
#define PG_HAS_INVALID (1<<11)
#define PG_LEFT_ON_DEAD (1<<12)
#define PG_DEGRADED (1<<7)
#define PG_HAS_INCOMPLETE (1<<8)
#define PG_HAS_DEGRADED (1<<9)
#define PG_HAS_MISPLACED (1<<10)
#define PG_HAS_UNCLEAN (1<<11)
#define PG_HAS_INVALID (1<<12)
#define PG_LEFT_ON_DEAD (1<<13)
// Lower bits that represent object role (EC 0/1/2... or always 0 with replication)
// 12 bits is a safe default that doesn't depend on pg_stripe_size or pg_block_size