From 61ebed144a892291a4764fc682b508a1906889ed Mon Sep 17 00:00:00 2001 From: Vitaliy Filippov Date: Wed, 9 Feb 2022 10:35:29 +0300 Subject: [PATCH] Fix OSDs possibly dying with "map::at" errors when other OSDs are stopped --- src/cli_rm.cpp | 3 ++- src/cluster_client_list.cpp | 3 ++- src/osd.h | 2 +- src/osd_flush.cpp | 25 ++++++++++++++++------ src/osd_peering.cpp | 4 ++-- src/osd_primary_chain.cpp | 14 +++++++++++-- src/osd_primary_subops.cpp | 42 +++++++++++++++++++++++++++++++------ 7 files changed, 74 insertions(+), 19 deletions(-) diff --git a/src/cli_rm.cpp b/src/cli_rm.cpp index 81a8c210..61b81598 100644 --- a/src/cli_rm.cpp +++ b/src/cli_rm.cpp @@ -96,7 +96,8 @@ struct rm_inode_t { osd_op_t *op = new osd_op_t(); op->op_type = OSD_OP_OUT; - op->peer_fd = parent->cli->msgr.osd_peer_fds[cur_list->rm_osd_num]; + // Already checked that it exists above, but anyway + op->peer_fd = parent->cli->msgr.osd_peer_fds.at(cur_list->rm_osd_num); op->req = (osd_any_op_t){ .rw = { .header = { diff --git a/src/cluster_client_list.cpp b/src/cluster_client_list.cpp index 779a6f76..fa443044 100644 --- a/src/cluster_client_list.cpp +++ b/src/cluster_client_list.cpp @@ -200,7 +200,8 @@ void cluster_client_t::send_list(inode_list_osd_t *cur_list) auto & pool_cfg = st_cli.pool_config[cur_list->pg->lst->pool_id]; osd_op_t *op = new osd_op_t(); op->op_type = OSD_OP_OUT; - op->peer_fd = msgr.osd_peer_fds[cur_list->osd_num]; + // Already checked that it exists above, but anyway + op->peer_fd = msgr.osd_peer_fds.at(cur_list->osd_num); op->req = (osd_any_op_t){ .sec_list = { .header = { diff --git a/src/osd.h b/src/osd.h index 3ead9bd4..6770ec3a 100644 --- a/src/osd.h +++ b/src/osd.h @@ -211,7 +211,7 @@ class osd_t // flushing, recovery and backfill void submit_pg_flush_ops(pg_t & pg); void handle_flush_op(bool rollback, pool_id_t pool_id, pg_num_t pg_num, pg_flush_batch_t *fb, osd_num_t peer_osd, int retval); - void submit_flush_op(pool_id_t pool_id, pg_num_t pg_num, pg_flush_batch_t *fb, bool rollback, osd_num_t peer_osd, int count, obj_ver_id *data); + bool submit_flush_op(pool_id_t pool_id, pg_num_t pg_num, pg_flush_batch_t *fb, bool rollback, osd_num_t peer_osd, int count, obj_ver_id *data); bool pick_next_recovery(osd_recovery_op_t &op); void submit_recovery_op(osd_recovery_op_t *op); bool continue_recovery(); diff --git a/src/osd_flush.cpp b/src/osd_flush.cpp index 2217c666..83b345ea 100644 --- a/src/osd_flush.cpp +++ b/src/osd_flush.cpp @@ -47,7 +47,8 @@ void osd_t::submit_pg_flush_ops(pg_t & pg) if (l.second.size() > 0) { fb->flush_ops++; - submit_flush_op(pg.pool_id, pg.pg_num, fb, true, l.first, l.second.size(), l.second.data()); + if (!submit_flush_op(pg.pool_id, pg.pg_num, fb, true, l.first, l.second.size(), l.second.data())) + return; } } for (auto & l: fb->stable_lists) @@ -55,7 +56,8 @@ void osd_t::submit_pg_flush_ops(pg_t & pg) if (l.second.size() > 0) { fb->flush_ops++; - submit_flush_op(pg.pool_id, pg.pg_num, fb, false, l.first, l.second.size(), l.second.data()); + if (!submit_flush_op(pg.pool_id, pg.pg_num, fb, false, l.first, l.second.size(), l.second.data())) + return; } } } @@ -160,7 +162,7 @@ void osd_t::handle_flush_op(bool rollback, pool_id_t pool_id, pg_num_t pg_num, p } } -void osd_t::submit_flush_op(pool_id_t pool_id, pg_num_t pg_num, pg_flush_batch_t *fb, bool rollback, osd_num_t peer_osd, int count, obj_ver_id *data) +bool osd_t::submit_flush_op(pool_id_t pool_id, pg_num_t pg_num, pg_flush_batch_t *fb, bool rollback, osd_num_t peer_osd, int count, obj_ver_id *data) { osd_op_t *op = new osd_op_t(); // Copy buffer so it gets freed along with the operation @@ -188,10 +190,8 @@ void osd_t::submit_flush_op(pool_id_t pool_id, pg_num_t pg_num, pg_flush_batch_t else { // Peer - int peer_fd = msgr.osd_peer_fds[peer_osd]; op->op_type = OSD_OP_OUT; op->iov.push_back(op->buf, count * sizeof(obj_ver_id)); - op->peer_fd = peer_fd; op->req = (osd_any_op_t){ .sec_stab = { .header = { @@ -207,8 +207,21 @@ void osd_t::submit_flush_op(pool_id_t pool_id, pg_num_t pg_num, pg_flush_batch_t handle_flush_op(op->req.hdr.opcode == OSD_OP_SEC_ROLLBACK, pool_id, pg_num, fb, peer_osd, op->reply.hdr.retval); delete op; }; - msgr.outbox_push(op); + auto peer_fd_it = msgr.osd_peer_fds.find(peer_osd); + if (peer_fd_it != msgr.osd_peer_fds.end()) + { + op->peer_fd = peer_fd_it->second; + msgr.outbox_push(op); + } + else + { + // Fail it immediately + op->reply.hdr.retval = -EPIPE; + op->callback(op); + return false; + } } + return true; } bool osd_t::pick_next_recovery(osd_recovery_op_t &op) diff --git a/src/osd_peering.cpp b/src/osd_peering.cpp index 69bd002c..07e5a280 100644 --- a/src/osd_peering.cpp +++ b/src/osd_peering.cpp @@ -340,7 +340,7 @@ void osd_t::submit_sync_and_list_subop(osd_num_t role_osd, pg_peering_state_t *p else { // Peer - auto & cl = msgr.clients.at(msgr.osd_peer_fds[role_osd]); + auto & cl = msgr.clients.at(msgr.osd_peer_fds.at(role_osd)); osd_op_t *op = new osd_op_t(); op->op_type = OSD_OP_OUT; op->peer_fd = cl->peer_fd; @@ -419,7 +419,7 @@ void osd_t::submit_list_subop(osd_num_t role_osd, pg_peering_state_t *ps) // Peer osd_op_t *op = new osd_op_t(); op->op_type = OSD_OP_OUT; - op->peer_fd = msgr.osd_peer_fds[role_osd]; + op->peer_fd = msgr.osd_peer_fds.at(role_osd); op->req = (osd_any_op_t){ .sec_list = { .header = { diff --git a/src/osd_primary_chain.cpp b/src/osd_primary_chain.cpp index 6f84b088..22ae5f78 100644 --- a/src/osd_primary_chain.cpp +++ b/src/osd_primary_chain.cpp @@ -246,7 +246,6 @@ int osd_t::submit_bitmap_subops(osd_op_t *cur_op, pg_t & pg) // Send to a remote OSD osd_op_t *subop = op_data->subops+subop_idx; subop->op_type = OSD_OP_OUT; - subop->peer_fd = msgr.osd_peer_fds.at(subop_osd_num); // FIXME: Use the pre-allocated buffer subop->buf = malloc_or_die(sizeof(obj_ver_id)*(i+1-prev)); subop->req = (osd_any_op_t){ @@ -287,7 +286,18 @@ int osd_t::submit_bitmap_subops(osd_op_t *cur_op, pg_t & pg) } handle_primary_subop(subop, cur_op); }; - msgr.outbox_push(subop); + auto peer_fd_it = msgr.osd_peer_fds.find(subop_osd_num); + if (peer_fd_it != msgr.osd_peer_fds.end()) + { + subop->peer_fd = peer_fd_it->second; + msgr.outbox_push(subop); + } + else + { + // Fail it immediately + subop->reply.hdr.retval = -EPIPE; + subop->callback(subop); + } subop_idx++; } prev = i+1; diff --git a/src/osd_primary_subops.cpp b/src/osd_primary_subops.cpp index 56bd6f0f..220f3c35 100644 --- a/src/osd_primary_subops.cpp +++ b/src/osd_primary_subops.cpp @@ -182,7 +182,6 @@ int osd_t::submit_primary_subop_batch(int submit_type, inode_t inode, uint64_t o else { subop->op_type = OSD_OP_OUT; - subop->peer_fd = msgr.osd_peer_fds.at(role_osd_num); subop->bitmap = stripes[stripe_num].bmp_buf; subop->bitmap_len = clean_entry_bitmap_size; subop->req.sec_rw = { @@ -225,7 +224,18 @@ int osd_t::submit_primary_subop_batch(int submit_type, inode_t inode, uint64_t o { handle_primary_subop(subop, cur_op); }; - msgr.outbox_push(subop); + auto peer_fd_it = msgr.osd_peer_fds.find(role_osd_num); + if (peer_fd_it != msgr.osd_peer_fds.end()) + { + subop->peer_fd = peer_fd_it->second; + msgr.outbox_push(subop); + } + else + { + // Fail it immediately + subop->reply.hdr.retval = -EPIPE; + subop->callback(subop); + } } i++; } @@ -463,7 +473,6 @@ void osd_t::submit_primary_del_batch(osd_op_t *cur_op, obj_ver_osd_t *chunks_to_ else { subops[i].op_type = OSD_OP_OUT; - subops[i].peer_fd = msgr.osd_peer_fds.at(chunk.osd_num); subops[i].req = (osd_any_op_t){ .sec_del = { .header = { .magic = SECONDARY_OSD_OP_MAGIC, @@ -477,7 +486,18 @@ void osd_t::submit_primary_del_batch(osd_op_t *cur_op, obj_ver_osd_t *chunks_to_ { handle_primary_subop(subop, cur_op); }; - msgr.outbox_push(&subops[i]); + auto peer_fd_it = msgr.osd_peer_fds.find(chunk.osd_num); + if (peer_fd_it != msgr.osd_peer_fds.end()) + { + subops[i].peer_fd = peer_fd_it->second; + msgr.outbox_push(&subops[i]); + } + else + { + // Fail it immediately + subops[i].reply.hdr.retval = -EPIPE; + subops[i].callback(&subops[i]); + } } } } @@ -567,7 +587,6 @@ void osd_t::submit_primary_stab_subops(osd_op_t *cur_op) else { subops[i].op_type = OSD_OP_OUT; - subops[i].peer_fd = msgr.osd_peer_fds.at(stab_osd.osd_num); subops[i].req = (osd_any_op_t){ .sec_stab = { .header = { .magic = SECONDARY_OSD_OP_MAGIC, @@ -581,7 +600,18 @@ void osd_t::submit_primary_stab_subops(osd_op_t *cur_op) { handle_primary_subop(subop, cur_op); }; - msgr.outbox_push(&subops[i]); + auto peer_fd_it = msgr.osd_peer_fds.find(stab_osd.osd_num); + if (peer_fd_it != msgr.osd_peer_fds.end()) + { + subops[i].peer_fd = peer_fd_it->second; + msgr.outbox_push(&subops[i]); + } + else + { + // Fail it immediately + subops[i].reply.hdr.retval = -EPIPE; + subops[i].callback(&subops[i]); + } } } }