From 0f43f6d3f6e3cb7554f1e9e9a6b7926b9765254e Mon Sep 17 00:00:00 2001 From: Vitaliy Filippov Date: Tue, 31 Mar 2020 17:50:50 +0300 Subject: [PATCH] Fix crashes, print some stats Notably: - fix the `delete op` inside lambda callback crash (it frees the lambda itself which results in use-after-free with g++) - fix stop_client() reenterability - fix a bug in the blockstore layer which resulted in always returning version=0 for zero-length reads - change error codes for blockstore_stabilize --- blockstore.h | 2 +- blockstore_impl.cpp | 6 +-- blockstore_impl.h | 2 +- blockstore_read.cpp | 81 +++++++++++++++++++++-------------------- blockstore_rollback.cpp | 2 +- blockstore_stable.cpp | 2 +- blockstore_write.cpp | 2 +- osd.cpp | 42 +++++++++++++-------- osd.h | 3 ++ osd_flush.cpp | 1 + osd_peering.cpp | 6 +++ osd_peering_pg.cpp | 8 ++++ osd_primary.cpp | 36 +++++++++++------- osd_receive.cpp | 3 +- 14 files changed, 118 insertions(+), 78 deletions(-) diff --git a/blockstore.h b/blockstore.h index 670bea33..3d99c29c 100644 --- a/blockstore.h +++ b/blockstore.h @@ -93,7 +93,7 @@ Input: - buf = pre-allocated obj_ver_id array units long Output: -- retval = 0 or negative error number (-EINVAL or -EBUSY if not synced) +- retval = 0 or negative error number (-EINVAL, -ENOENT if no such version or -EBUSY if not synced) ## BS_OP_SYNC_STAB_ALL diff --git a/blockstore_impl.cpp b/blockstore_impl.cpp index cedbdf69..ba632595 100644 --- a/blockstore_impl.cpp +++ b/blockstore_impl.cpp @@ -322,7 +322,7 @@ void blockstore_impl_t::enqueue_op(blockstore_op_t *op, bool first) { // Basic verification not passed op->retval = -EINVAL; - op->callback(op); + std::function(op->callback)(op); return; } if (op->opcode == BS_OP_SYNC_STAB_ALL) @@ -365,13 +365,13 @@ void blockstore_impl_t::enqueue_op(blockstore_op_t *op, bool first) } if (op->opcode == BS_OP_WRITE && !enqueue_write(op)) { - op->callback(op); + std::function(op->callback)(op); return; } if (op->opcode == BS_OP_SYNC && immediate_commit == IMMEDIATE_ALL) { op->retval = 0; - op->callback(op); + std::function(op->callback)(op); return; } // Call constructor without allocating memory. We'll call destructor before returning op back diff --git a/blockstore_impl.h b/blockstore_impl.h index ff6c915c..745ac261 100644 --- a/blockstore_impl.h +++ b/blockstore_impl.h @@ -141,7 +141,7 @@ struct fulfill_read_t }; #define PRIV(op) ((blockstore_op_private_t*)(op)->private_data) -#define FINISH_OP(op) PRIV(op)->~blockstore_op_private_t(); op->callback(op) +#define FINISH_OP(op) PRIV(op)->~blockstore_op_private_t(); std::function(op->callback)(op) struct blockstore_op_private_t { diff --git a/blockstore_read.cpp b/blockstore_read.cpp index 1de838ee..25ac6b5f 100644 --- a/blockstore_read.cpp +++ b/blockstore_read.cpp @@ -131,63 +131,66 @@ int blockstore_impl_t::dequeue_read(blockstore_op_t *read_op) dirty_it--; } } - if (clean_it != clean_db.end() && fulfilled < read_op->len) + if (clean_it != clean_db.end()) { if (!result_version) { result_version = clean_it->second.version; } - if (!clean_entry_bitmap_size) + if (fulfilled < read_op->len) { - if (!fulfill_read(read_op, fulfilled, 0, block_size, ST_CURRENT, 0, clean_it->second.location)) + if (!clean_entry_bitmap_size) { - // need to wait. undo added requests, don't dequeue op - PRIV(read_op)->read_vec.clear(); - return 0; - } - } - else - { - uint64_t meta_loc = clean_it->second.location >> block_order; - uint8_t *clean_entry_bitmap; - if (inmemory_meta) - { - uint64_t sector = (meta_loc / (meta_block_size / clean_entry_size)) * meta_block_size; - uint64_t pos = (meta_loc % (meta_block_size / clean_entry_size)); - clean_entry_bitmap = (uint8_t*)(metadata_buffer + sector + pos*clean_entry_size + sizeof(clean_disk_entry)); + if (!fulfill_read(read_op, fulfilled, 0, block_size, ST_CURRENT, 0, clean_it->second.location)) + { + // need to wait. undo added requests, don't dequeue op + PRIV(read_op)->read_vec.clear(); + return 0; + } } else { - clean_entry_bitmap = (uint8_t*)(clean_bitmap + meta_loc*clean_entry_bitmap_size); - } - uint64_t bmp_start = 0, bmp_end = 0, bmp_size = block_size/bitmap_granularity; - while (bmp_start < bmp_size) - { - while (!(clean_entry_bitmap[bmp_end >> 3] & (1 << (bmp_end & 0x7))) && bmp_end < bmp_size) + uint64_t meta_loc = clean_it->second.location >> block_order; + uint8_t *clean_entry_bitmap; + if (inmemory_meta) { - bmp_end++; + uint64_t sector = (meta_loc / (meta_block_size / clean_entry_size)) * meta_block_size; + uint64_t pos = (meta_loc % (meta_block_size / clean_entry_size)); + clean_entry_bitmap = (uint8_t*)(metadata_buffer + sector + pos*clean_entry_size + sizeof(clean_disk_entry)); } - if (bmp_end > bmp_start) + else { - // fill with zeroes - fulfill_read(read_op, fulfilled, bmp_start * bitmap_granularity, - bmp_end * bitmap_granularity, ST_DEL_STABLE, 0, 0); + clean_entry_bitmap = (uint8_t*)(clean_bitmap + meta_loc*clean_entry_bitmap_size); } - bmp_start = bmp_end; - while (clean_entry_bitmap[bmp_end >> 3] & (1 << (bmp_end & 0x7)) && bmp_end < bmp_size) + uint64_t bmp_start = 0, bmp_end = 0, bmp_size = block_size/bitmap_granularity; + while (bmp_start < bmp_size) { - bmp_end++; - } - if (bmp_end > bmp_start) - { - if (!fulfill_read(read_op, fulfilled, bmp_start * bitmap_granularity, - bmp_end * bitmap_granularity, ST_CURRENT, 0, clean_it->second.location + bmp_start * bitmap_granularity)) + while (!(clean_entry_bitmap[bmp_end >> 3] & (1 << (bmp_end & 0x7))) && bmp_end < bmp_size) { - // need to wait. undo added requests, don't dequeue op - PRIV(read_op)->read_vec.clear(); - return 0; + bmp_end++; + } + if (bmp_end > bmp_start) + { + // fill with zeroes + fulfill_read(read_op, fulfilled, bmp_start * bitmap_granularity, + bmp_end * bitmap_granularity, ST_DEL_STABLE, 0, 0); } bmp_start = bmp_end; + while (clean_entry_bitmap[bmp_end >> 3] & (1 << (bmp_end & 0x7)) && bmp_end < bmp_size) + { + bmp_end++; + } + if (bmp_end > bmp_start) + { + if (!fulfill_read(read_op, fulfilled, bmp_start * bitmap_granularity, + bmp_end * bitmap_granularity, ST_CURRENT, 0, clean_it->second.location + bmp_start * bitmap_granularity)) + { + // need to wait. undo added requests, don't dequeue op + PRIV(read_op)->read_vec.clear(); + return 0; + } + bmp_start = bmp_end; + } } } } diff --git a/blockstore_rollback.cpp b/blockstore_rollback.cpp index d7821e2b..b48c76c9 100644 --- a/blockstore_rollback.cpp +++ b/blockstore_rollback.cpp @@ -24,7 +24,7 @@ int blockstore_impl_t::dequeue_rollback(blockstore_op_t *op) // FIXME Skip this object version } bad_op: - op->retval = -EINVAL; + op->retval = -ENOENT; FINISH_OP(op); return 1; } diff --git a/blockstore_stable.cpp b/blockstore_stable.cpp index e5fba5ab..295cf087 100644 --- a/blockstore_stable.cpp +++ b/blockstore_stable.cpp @@ -55,7 +55,7 @@ int blockstore_impl_t::dequeue_stable(blockstore_op_t *op) if (clean_it == clean_db.end() || clean_it->second.version < v->version) { // No such object version - op->retval = -EINVAL; + op->retval = -ENOENT; FINISH_OP(op); return 1; } diff --git a/blockstore_write.cpp b/blockstore_write.cpp index d9fd2b37..932bed4a 100644 --- a/blockstore_write.cpp +++ b/blockstore_write.cpp @@ -42,7 +42,7 @@ bool blockstore_impl_t::enqueue_write(blockstore_op_t *op) else if (op->version < version) { // Invalid version requested - op->retval = -EINVAL; + op->retval = -EEXIST; return false; } if (deleted && is_del) diff --git a/osd.cpp b/osd.cpp index d23b2233..23124554 100644 --- a/osd.cpp +++ b/osd.cpp @@ -7,7 +7,7 @@ #include "osd.h" -static const char* osd_op_names[] = { +const char* osd_op_names[] = { "", "read", "write", @@ -54,6 +54,18 @@ osd_t::osd_t(blockstore_config_t & config, blockstore_t *bs, ring_loop_t *ringlo send_stat_count = 0; send_stat_sum = 0; } + if (incomplete_objects > 0) + { + printf("%lu object(s) incomplete\n", incomplete_objects); + } + if (degraded_objects > 0) + { + printf("%lu object(s) degraded\n", degraded_objects); + } + if (misplaced_objects > 0) + { + printf("%lu object(s) misplaced\n", misplaced_objects); + } }); this->bs_block_size = bs->get_block_size(); // FIXME: use bitmap granularity instead @@ -301,7 +313,8 @@ void osd_t::cancel_op(osd_op_t *op) op->reply.hdr.id = op->req.hdr.id; op->reply.hdr.opcode = op->req.hdr.opcode; op->reply.hdr.retval = -EPIPE; - op->callback(op); + // Copy lambda to be unaffected by `delete op` + std::function(op->callback)(op); } else { @@ -316,7 +329,16 @@ void osd_t::stop_client(int peer_fd) { return; } - auto & cl = it->second; + osd_client_t cl = it->second; + if (cl.osd_num) + { + printf("[%lu] Stopping client %d (OSD peer %lu)\n", osd_num, peer_fd, cl.osd_num); + } + else + { + printf("[%lu] Stopping client %d (regular client)\n", osd_num, peer_fd); + } + clients.erase(it); if (epoll_ctl(epoll_fd, EPOLL_CTL_DEL, peer_fd, NULL) < 0) { throw std::runtime_error(std::string("epoll_ctl: ") + strerror(errno)); @@ -350,7 +372,6 @@ void osd_t::stop_client(int peer_fd) } } free(cl.in_buf); - clients.erase(it); close(peer_fd); } @@ -372,18 +393,7 @@ void osd_t::exec_op(osd_op_t *cur_op) (cur_op->req.rw.len > OSD_RW_MAX || cur_op->req.rw.len % OSD_RW_ALIGN || cur_op->req.rw.offset % OSD_RW_ALIGN)) { // Bad command - cur_op->reply.hdr.magic = SECONDARY_OSD_REPLY_MAGIC; - cur_op->reply.hdr.id = cur_op->req.hdr.id; - cur_op->reply.hdr.opcode = cur_op->req.hdr.opcode; - cur_op->reply.hdr.retval = -EINVAL; - if (cur_op->peer_fd) - { - outbox_push(this->clients[cur_op->peer_fd], cur_op); - } - else - { - cur_op->callback(cur_op); - } + finish_op(cur_op, -EINVAL); return; } inflight_ops++; diff --git a/osd.h b/osd.h index 2eb17f88..004bee6f 100644 --- a/osd.h +++ b/osd.h @@ -44,6 +44,8 @@ //#define OSD_STUB +extern const char* osd_op_names[]; + struct osd_op_buf_list_t { int count = 0, alloc = 0, sent = 0; @@ -194,6 +196,7 @@ class osd_t std::map osd_peer_fds; std::map pgs; + uint64_t misplaced_objects = 0, degraded_objects = 0, incomplete_objects = 0; int peering_state = 0; unsigned pg_count = 0; uint64_t next_subop_id = 1; diff --git a/osd_flush.cpp b/osd_flush.cpp index 96b74565..711f2eec 100644 --- a/osd_flush.cpp +++ b/osd_flush.cpp @@ -254,6 +254,7 @@ resume_4: auto st_it = pg->degraded_objects.find(recovery_state.oid); st = st_it->second; pg->degraded_objects.erase(st_it); + degraded_objects--; } st->object_count--; if (st->state == OBJ_DEGRADED) diff --git a/osd_peering.cpp b/osd_peering.cpp index 82e78b8a..f8b09597 100644 --- a/osd_peering.cpp +++ b/osd_peering.cpp @@ -176,6 +176,9 @@ void osd_t::handle_peers() if (!p.second.peering_state->list_ops.size()) { p.second.calc_object_states(); + incomplete_objects += p.second.incomplete_objects.size(); + misplaced_objects += p.second.misplaced_objects.size(); + degraded_objects += p.second.degraded_objects.size(); if (p.second.state & PG_HAS_UNCLEAN) peering_state = peering_state | OSD_FLUSHING_PGS; else @@ -256,6 +259,9 @@ void osd_t::start_pg_peering(pg_num_t pg_num) pg.state = PG_PEERING; pg.print_state(); pg.state_dict.clear(); + incomplete_objects -= pg.incomplete_objects.size(); + misplaced_objects -= pg.misplaced_objects.size(); + degraded_objects -= pg.degraded_objects.size(); pg.incomplete_objects.clear(); pg.misplaced_objects.clear(); pg.degraded_objects.clear(); diff --git a/osd_peering_pg.cpp b/osd_peering_pg.cpp index 5d1289aa..0c39b01d 100644 --- a/osd_peering_pg.cpp +++ b/osd_peering_pg.cpp @@ -204,6 +204,14 @@ void pg_obj_state_check_t::finish_object() { printf("Present on: osd %lu, role %ld%s\n", list[i].osd_num, (list[i].oid.stripe & STRIPE_MASK), list[i].is_stable ? " (stable)" : ""); } + if (0) + { + // For future debug level + for (int i = obj_start; i < obj_end; i++) + { + printf("v%lu present on: osd %lu, role %ld%s\n", list[i].version, list[i].osd_num, (list[i].oid.stripe & STRIPE_MASK), list[i].is_stable ? " (stable)" : ""); + } + } state = OBJ_INCOMPLETE; pg->state = pg->state | PG_HAS_INCOMPLETE; } diff --git a/osd_primary.cpp b/osd_primary.cpp index e5f17142..ba79ef2e 100644 --- a/osd_primary.cpp +++ b/osd_primary.cpp @@ -42,7 +42,8 @@ void osd_t::finish_op(osd_op_t *cur_op, int retval) { if (!cur_op->peer_fd) { - cur_op->callback(cur_op); + // Copy lambda to be unaffected by `delete op` + std::function(cur_op->callback)(cur_op); } else { @@ -254,7 +255,7 @@ void osd_t::submit_primary_subops(int submit_type, int pg_size, const uint64_t* if (subop->opcode == BS_OP_WRITE && subop->retval != subop->len) { // die - throw std::runtime_error("local write operation failed"); + throw std::runtime_error("local write operation failed (retval = "+std::to_string(subop->retval)+")"); } handle_primary_subop( subop->opcode == BS_OP_WRITE ? OSD_OP_SECONDARY_WRITE : OSD_OP_SECONDARY_READ, @@ -298,17 +299,19 @@ void osd_t::submit_primary_subops(int submit_type, int pg_size, const uint64_t* } subops[subop].callback = [cur_op, this](osd_op_t *subop) { + int fail_fd = subop->req.hdr.opcode == OSD_OP_SECONDARY_WRITE && + subop->reply.hdr.retval != subop->req.sec_rw.len ? subop->peer_fd : -1; // so it doesn't get freed subop->buf = NULL; - if (subop->req.hdr.opcode == OSD_OP_SECONDARY_WRITE && cur_op->reply.hdr.retval != cur_op->req.sec_rw.len) - { - // write operation failed, drop the connection - stop_client(subop->peer_fd); - } handle_primary_subop( subop->req.hdr.opcode, cur_op, subop->reply.hdr.retval, subop->req.sec_rw.len, subop->reply.sec_rw.version ); + if (fail_fd >= 0) + { + // write operation failed, drop the connection + stop_client(fail_fd); + } }; outbox_push(clients[subops[subop].peer_fd], &subops[subop]); } @@ -322,8 +325,11 @@ void osd_t::handle_primary_subop(uint64_t opcode, osd_op_t *cur_op, int retval, osd_primary_op_data_t *op_data = cur_op->op_data; if (retval != expected) { + printf("%s subop failed: retval = %d (expected %d)\n", osd_op_names[opcode], retval, expected); if (retval == -EPIPE) + { op_data->epipe++; + } op_data->errors++; } else @@ -565,7 +571,7 @@ resume_6: op_data->st = 6; return; resume_7: - // FIXME: Free them correctly (via a destructor or so) + // FIXME: Free those in the destructor? delete op_data->unstable_write_osds; delete[] op_data->unstable_writes; op_data->unstable_writes = NULL; @@ -796,12 +802,13 @@ void osd_t::submit_primary_sync_subops(osd_op_t *cur_op) }; subops[i].callback = [cur_op, this](osd_op_t *subop) { - if (cur_op->reply.hdr.retval != 0) + int fail_fd = subop->reply.hdr.retval != 0 ? subop->peer_fd : 0; + handle_primary_subop(OSD_OP_SECONDARY_SYNC, cur_op, subop->reply.hdr.retval, 0, 0); + if (fail_fd >= 0) { // sync operation failed, drop the connection - stop_client(subop->peer_fd); + stop_client(fail_fd); } - handle_primary_subop(OSD_OP_SECONDARY_SYNC, cur_op, subop->reply.hdr.retval, 0, 0); }; outbox_push(clients[subops[i].peer_fd], &subops[i]); } @@ -853,12 +860,13 @@ void osd_t::submit_primary_stab_subops(osd_op_t *cur_op) subops[i].send_list.push_back(op_data->unstable_writes + stab_osd.start, stab_osd.len * sizeof(obj_ver_id)); subops[i].callback = [cur_op, this](osd_op_t *subop) { - if (cur_op->reply.hdr.retval != 0) + int fail_fd = subop->reply.hdr.retval != 0 ? subop->peer_fd : 0; + handle_primary_subop(OSD_OP_SECONDARY_STABILIZE, cur_op, subop->reply.hdr.retval, 0, 0); + if (fail_fd >= 0) { // sync operation failed, drop the connection - stop_client(subop->peer_fd); + stop_client(fail_fd); } - handle_primary_subop(OSD_OP_SECONDARY_STABILIZE, cur_op, subop->reply.hdr.retval, 0, 0); }; outbox_push(clients[subops[i].peer_fd], &subops[i]); } diff --git a/osd_receive.cpp b/osd_receive.cpp index d68e7045..3b9c3d0d 100644 --- a/osd_receive.cpp +++ b/osd_receive.cpp @@ -247,6 +247,7 @@ void osd_t::handle_reply_hdr(osd_client_t *cl) (tv_end.tv_sec - op->tv_begin.tv_sec)*1000000 + (tv_end.tv_nsec - op->tv_begin.tv_nsec)/1000 ); - op->callback(op); + // Copy lambda to be unaffected by `delete op` + std::function(op->callback)(op); } }