From 0aa2dd2890d945bcb7466f7c6f8f5126a5d0f0d7 Mon Sep 17 00:00:00 2001 From: Vitaliy Filippov Date: Wed, 13 Jan 2021 00:19:04 +0300 Subject: [PATCH] Send bitmaps with primary-reads, actually read bitmaps for READ ops --- src/cluster_client.cpp | 8 ++++++++ src/cluster_client.h | 3 +++ src/msgr_op.h | 1 + src/msgr_receive.cpp | 10 ++++++++-- src/osd_ops.h | 6 +++++- src/osd_primary.cpp | 2 ++ src/osd_primary_subops.cpp | 1 + 7 files changed, 28 insertions(+), 3 deletions(-) diff --git a/src/cluster_client.cpp b/src/cluster_client.cpp index 0a5c20f2b..a1b007ff0 100644 --- a/src/cluster_client.cpp +++ b/src/cluster_client.cpp @@ -63,6 +63,11 @@ cluster_client_t::cluster_client_t(ring_loop_t *ringloop, timerfd_manager_t *tfd st_cli.parse_config(config); st_cli.load_global_config(); + // Temporary implementation: discard all bitmaps + // It will be of course replaced by the implementation of snapshots + scrap_bitmap_size = 4096; + scrap_bitmap = malloc_or_die(scrap_bitmap_size); + if (ringloop) { consumer.loop = [this]() @@ -86,6 +91,7 @@ cluster_client_t::~cluster_client_t() { ringloop->unregister_consumer(&consumer); } + free(scrap_bitmap); } void cluster_client_t::continue_ops(bool up_retry) @@ -681,6 +687,8 @@ bool cluster_client_t::try_send(cluster_op_t *op, int i) .offset = part->offset, .len = part->len, } }, + .bitmap = scrap_bitmap, + .bitmap_len = scrap_bitmap_size, .callback = [this, part](osd_op_t *op_part) { handle_op_part(part); diff --git a/src/cluster_client.h b/src/cluster_client.h index fc00b3631..31d5f8f82 100644 --- a/src/cluster_client.h +++ b/src/cluster_client.h @@ -77,6 +77,9 @@ class cluster_client_t std::set dirty_osds; uint64_t dirty_bytes = 0, dirty_ops = 0; + void *scrap_bitmap = NULL; + unsigned scrap_bitmap_size = 0; + bool pgs_loaded = false; ring_consumer_t consumer; std::vector> on_ready_hooks; diff --git a/src/msgr_op.h b/src/msgr_op.h index b2d268f16..18667e040 100644 --- a/src/msgr_op.h +++ b/src/msgr_op.h @@ -162,6 +162,7 @@ struct osd_op_t blockstore_op_t *bs_op = NULL; void *buf = NULL; void *bitmap = NULL; + unsigned bitmap_len = 0; unsigned bmp_data = 0; void *rmw_buf = NULL; osd_primary_op_data_t* op_data = NULL; diff --git a/src/msgr_receive.cpp b/src/msgr_receive.cpp index 6f0d11889..7bff6093e 100644 --- a/src/msgr_receive.cpp +++ b/src/msgr_receive.cpp @@ -278,7 +278,9 @@ bool osd_messenger_t::handle_reply_hdr(osd_client_t *cl) { // Read data. In this case we assume that the buffer is preallocated by the caller (!) assert(op->iov.count > 0); - if (op->reply.hdr.retval != (op->reply.hdr.opcode == OSD_OP_SEC_READ ? op->req.sec_rw.len : op->req.rw.len)) + unsigned bmp_len = (op->reply.hdr.opcode == OSD_OP_SEC_READ ? op->reply.sec_rw.attr_len : op->reply.rw.bitmap_len); + if (op->reply.hdr.retval != (op->reply.hdr.opcode == OSD_OP_SEC_READ ? op->req.sec_rw.len : op->req.rw.len) || + bmp_len > op->bitmap_len) { // Check reply length to not overflow the buffer printf("Client %d read reply of different length\n", cl->peer_fd); @@ -286,11 +288,15 @@ bool osd_messenger_t::handle_reply_hdr(osd_client_t *cl) stop_client(cl->peer_fd); return false; } + if (bmp_len > 0) + { + cl->recv_list.push_back(op->bitmap, bmp_len); + } cl->recv_list.append(op->iov); delete cl->read_op; cl->read_op = op; cl->read_state = CL_READ_REPLY_DATA; - cl->read_remaining = op->reply.hdr.retval; + cl->read_remaining = op->reply.hdr.retval + (op->reply.hdr.opcode == OSD_OP_SEC_READ ? op->reply.sec_rw.attr_len : op->reply.rw.bitmap_len); } else if (op->reply.hdr.opcode == OSD_OP_SEC_LIST && op->reply.hdr.retval > 0) { diff --git a/src/osd_ops.h b/src/osd_ops.h index fece585f1..c4cb97bf6 100644 --- a/src/osd_ops.h +++ b/src/osd_ops.h @@ -73,6 +73,7 @@ struct __attribute__((__packed__)) osd_op_secondary_rw_t uint32_t len; // bitmap/attribute length - bitmap comes after header, but before data uint32_t attr_len; + uint32_t pad0; }; struct __attribute__((__packed__)) osd_reply_secondary_rw_t @@ -82,6 +83,7 @@ struct __attribute__((__packed__)) osd_reply_secondary_rw_t uint64_t version; // for reads: bitmap/attribute length (just to double-check) uint32_t attr_len; + uint32_t pad0; }; // delete object on the secondary OSD @@ -158,7 +160,6 @@ struct __attribute__((__packed__)) osd_reply_secondary_list_t }; // read or write to the primary OSD (must be within individual stripe) -// FIXME: allow to return used block bitmap (required for snapshots) struct __attribute__((__packed__)) osd_op_rw_t { osd_op_header_t header; @@ -173,6 +174,9 @@ struct __attribute__((__packed__)) osd_op_rw_t struct __attribute__((__packed__)) osd_reply_rw_t { osd_reply_header_t header; + // for reads: bitmap length + uint32_t bitmap_len; + uint32_t pad0; }; // sync to the primary OSD diff --git a/src/osd_primary.cpp b/src/osd_primary.cpp index 9c10bf291..f16deb383 100644 --- a/src/osd_primary.cpp +++ b/src/osd_primary.cpp @@ -179,6 +179,8 @@ resume_2: } else { + cur_op->reply.rw.bitmap_len = op_data->pg_data_size * entry_attr_size; + cur_op->iov.push_back(op_data->stripes[0].bmp_buf, cur_op->reply.rw.bitmap_len); cur_op->iov.push_back(cur_op->buf, cur_op->req.rw.len); } finish_op(cur_op, cur_op->req.rw.len); diff --git a/src/osd_primary_subops.cpp b/src/osd_primary_subops.cpp index 0a81ebf4f..8fd034381 100644 --- a/src/osd_primary_subops.cpp +++ b/src/osd_primary_subops.cpp @@ -161,6 +161,7 @@ void osd_t::submit_primary_subops(int submit_type, uint64_t op_version, int pg_s subops[i].op_type = OSD_OP_OUT; subops[i].peer_fd = c_cli.osd_peer_fds.at(role_osd_num); subops[i].bitmap = stripes[stripe_num].bmp_buf; + subops[i].bitmap_len = entry_attr_size; subops[i].req.sec_rw = { .header = { .magic = SECONDARY_OSD_OP_MAGIC,