From 8faf8f7b58e8ffcc6f20b9d40af3c3926efd9234 Mon Sep 17 00:00:00 2001
From: Vitaliy Filippov
Date: Sun, 25 Apr 2021 19:49:36 +0300
Subject: [PATCH] Inline bitmaps

Handy for zero-copy RDMA tests (removes 4-byte s/g entries)
---
Review notes:
- msgr_send.cpp: the s/g entry for long SEC_WRITE bitmaps takes its length
  from req.sec_rw.bitmap_len (attr_len no longer exists once osd_ops.h is
  patched by this same change).
- NOTE(review): bitmaps of up to sizeof(void*) bytes are pointed at
  osd_op_t::bmp_data, whose declaration this patch does not touch - confirm
  that field is at least sizeof(void*) bytes wide.
- Leading whitespace of the hunks was restored by hand; use
  `git apply --ignore-whitespace` if context lines do not match.

 src/msgr_receive.cpp       | 32 ++++++++++++++++++++++++--------
 src/msgr_send.cpp          | 38 ++++++++++++++++++++++++++------------
 src/osd_ops.h              | 12 +++++++++---
 src/osd_primary.cpp        | 10 ++++++++--
 src/osd_primary_subops.cpp |  2 +-
 src/osd_secondary.cpp      |  6 +++---
 6 files changed, 71 insertions(+), 29 deletions(-)

diff --git a/src/msgr_receive.cpp b/src/msgr_receive.cpp
index bf8ed1b7..7250830a 100644
--- a/src/msgr_receive.cpp
+++ b/src/msgr_receive.cpp
@@ -207,20 +207,26 @@ void osd_messenger_t::handle_op_hdr(osd_client_t *cl)
     else if (cur_op->req.hdr.opcode == OSD_OP_SEC_WRITE ||
         cur_op->req.hdr.opcode == OSD_OP_SEC_WRITE_STABLE)
     {
-        if (cur_op->req.sec_rw.attr_len > 0)
+        if (cur_op->req.sec_rw.bitmap_len > 0)
         {
-            if (cur_op->req.sec_rw.attr_len > sizeof(unsigned))
-                cur_op->bitmap = cur_op->rmw_buf = malloc_or_die(cur_op->req.sec_rw.attr_len);
+            if (cur_op->req.sec_rw.bitmap_len > sizeof(void*))
+                cur_op->bitmap = cur_op->rmw_buf = malloc_or_die(cur_op->req.sec_rw.bitmap_len);
             else
                 cur_op->bitmap = &cur_op->bmp_data;
-            cl->recv_list.push_back(cur_op->bitmap, cur_op->req.sec_rw.attr_len);
+            if (cur_op->req.sec_rw.bitmap_len <= 8)
+                memcpy(cur_op->bitmap, &cur_op->req.sec_rw.bitmap, cur_op->req.sec_rw.bitmap_len);
+            else
+            {
+                cl->recv_list.push_back(cur_op->bitmap, cur_op->req.sec_rw.bitmap_len);
+                cl->read_remaining += cur_op->req.sec_rw.bitmap_len;
+            }
         }
         if (cur_op->req.sec_rw.len > 0)
         {
             cur_op->buf = memalign_or_die(MEM_ALIGNMENT, cur_op->req.sec_rw.len);
             cl->recv_list.push_back(cur_op->buf, cur_op->req.sec_rw.len);
+            cl->read_remaining += cur_op->req.sec_rw.len;
         }
-        cl->read_remaining = cur_op->req.sec_rw.len + cur_op->req.sec_rw.attr_len;
     }
     else if (cur_op->req.hdr.opcode == OSD_OP_SEC_STABILIZE ||
         cur_op->req.hdr.opcode == OSD_OP_SEC_ROLLBACK)
@@ -295,7 +301,7 @@ bool osd_messenger_t::handle_reply_hdr(osd_client_t *cl)
     if (op->reply.hdr.opcode == OSD_OP_SEC_READ || op->reply.hdr.opcode == OSD_OP_READ)
     {
         // Read data. In this case we assume that the buffer is preallocated by the caller (!)
-        unsigned bmp_len = (op->reply.hdr.opcode == OSD_OP_SEC_READ ? op->reply.sec_rw.attr_len : op->reply.rw.bitmap_len);
+        unsigned bmp_len = (op->reply.hdr.opcode == OSD_OP_SEC_READ ? op->reply.sec_rw.bitmap_len : op->reply.rw.bitmap_len);
         unsigned expected_size = (op->reply.hdr.opcode == OSD_OP_SEC_READ ? op->req.sec_rw.len : op->req.rw.len);
         if (op->reply.hdr.retval >= 0 && (op->reply.hdr.retval != expected_size || bmp_len > op->bitmap_len))
         {
@@ -309,14 +315,24 @@ bool osd_messenger_t::handle_reply_hdr(osd_client_t *cl)
         if (op->reply.hdr.retval >= 0 && bmp_len > 0)
         {
             assert(op->bitmap);
-            cl->recv_list.push_back(op->bitmap, bmp_len);
+            if (bmp_len <= 8)
+            {
+                memcpy(op->bitmap, (op->reply.hdr.opcode == OSD_OP_SEC_READ
+                    ? &op->reply.sec_rw.bitmap
+                    : &op->reply.rw.bitmap), bmp_len);
+            }
+            else
+            {
+                cl->recv_list.push_back(op->bitmap, bmp_len);
+                cl->read_remaining += bmp_len;
+            }
         }
         if (op->reply.hdr.retval > 0)
         {
             assert(op->iov.count > 0);
             cl->recv_list.append(op->iov);
+            cl->read_remaining += op->reply.hdr.retval;
         }
-        cl->read_remaining = op->reply.hdr.retval + bmp_len;
         if (cl->read_remaining == 0)
         {
             goto reuse;
diff --git a/src/msgr_send.cpp b/src/msgr_send.cpp
index cd06c4ec..564c72fc 100644
--- a/src/msgr_send.cpp
+++ b/src/msgr_send.cpp
@@ -50,23 +50,37 @@ void osd_messenger_t::outbox_push(osd_op_t *cur_op)
     // Bitmap
     if (cur_op->op_type == OSD_OP_IN &&
         cur_op->req.hdr.opcode == OSD_OP_SEC_READ &&
-        cur_op->reply.sec_rw.attr_len > 0)
+        cur_op->reply.sec_rw.bitmap_len > 0)
     {
-        to_send_list.push_back((iovec){
-            .iov_base = cur_op->bitmap,
-            .iov_len = cur_op->reply.sec_rw.attr_len,
-        });
-        to_outbox.push_back((msgr_sendp_t){ .op = cur_op, .flags = 0 });
+        if (cur_op->reply.sec_rw.bitmap_len <= 8)
+        {
+            memcpy(&cur_op->reply.sec_rw.bitmap, cur_op->bitmap, cur_op->reply.sec_rw.bitmap_len);
+        }
+        else
+        {
+            to_send_list.push_back((iovec){
+                .iov_base = cur_op->bitmap,
+                .iov_len = cur_op->reply.sec_rw.bitmap_len,
+            });
+            to_outbox.push_back((msgr_sendp_t){ .op = cur_op, .flags = 0 });
+        }
     }
     else if (cur_op->op_type == OSD_OP_OUT &&
         (cur_op->req.hdr.opcode == OSD_OP_SEC_WRITE || cur_op->req.hdr.opcode == OSD_OP_SEC_WRITE_STABLE) &&
-        cur_op->req.sec_rw.attr_len > 0)
+        cur_op->req.sec_rw.bitmap_len > 0)
     {
-        to_send_list.push_back((iovec){
-            .iov_base = cur_op->bitmap,
-            .iov_len = cur_op->req.sec_rw.attr_len,
-        });
-        to_outbox.push_back((msgr_sendp_t){ .op = cur_op, .flags = 0 });
+        if (cur_op->req.sec_rw.bitmap_len <= 8)
+        {
+            memcpy(&cur_op->req.sec_rw.bitmap, cur_op->bitmap, cur_op->req.sec_rw.bitmap_len);
+        }
+        else
+        {
+            to_send_list.push_back((iovec){
+                .iov_base = cur_op->bitmap,
+                .iov_len = cur_op->req.sec_rw.bitmap_len,
+            });
+            to_outbox.push_back((msgr_sendp_t){ .op = cur_op, .flags = 0 });
+        }
     }
     // Operation data
     if ((cur_op->op_type == OSD_OP_IN
diff --git a/src/osd_ops.h b/src/osd_ops.h
index e8078b71..754cdd2e 100644
--- a/src/osd_ops.h
+++ b/src/osd_ops.h
@@ -35,7 +35,7 @@
 #define MEM_ALIGNMENT 512
 #endif
 #define OSD_RW_MAX 64*1024*1024
-#define OSD_PROTOCOL_VERSION 1
+#define OSD_PROTOCOL_VERSION 2
 
 // common request and reply headers
 struct __attribute__((__packed__)) osd_op_header_t
@@ -74,8 +74,10 @@ struct __attribute__((__packed__)) osd_op_sec_rw_t
     // length
     uint32_t len;
     // bitmap/attribute length - bitmap comes after header, but before data
-    uint32_t attr_len;
+    uint32_t bitmap_len;
     uint32_t pad0;
+    // inline bitmap (when it's no longer than 8 bytes)
+    uint64_t bitmap;
 };
 
 struct __attribute__((__packed__)) osd_reply_sec_rw_t
@@ -84,8 +86,10 @@ struct __attribute__((__packed__)) osd_reply_sec_rw_t
     // for reads and writes: assigned or read version number
     uint64_t version;
     // for reads: bitmap/attribute length (just to double-check)
-    uint32_t attr_len;
+    uint32_t bitmap_len;
     uint32_t pad0;
+    // inline bitmap (when it's no longer than 8 bytes)
+    uint64_t bitmap;
 };
 
 // delete object on the secondary OSD
@@ -199,6 +203,8 @@ struct __attribute__((__packed__)) osd_reply_rw_t
     // for reads: bitmap length
     uint32_t bitmap_len;
     uint32_t pad0;
+    // inline bitmap (when it's no longer than 8 bytes)
+    uint64_t bitmap;
 };
 
 // sync to the primary OSD
diff --git a/src/osd_primary.cpp b/src/osd_primary.cpp
index 3a3e685e..858461bc 100644
--- a/src/osd_primary.cpp
+++ b/src/osd_primary.cpp
@@ -235,7 +235,10 @@ resume_2:
         {
             reconstruct_stripes_jerasure(stripes, op_data->pg_size, op_data->pg_data_size, clean_entry_bitmap_size);
         }
-        cur_op->iov.push_back(op_data->stripes[0].bmp_buf, cur_op->reply.rw.bitmap_len);
+        if (cur_op->reply.rw.bitmap_len <= 8)
+            memcpy(&cur_op->reply.rw.bitmap, op_data->stripes[0].bmp_buf, cur_op->reply.rw.bitmap_len);
+        else
+            cur_op->iov.push_back(op_data->stripes[0].bmp_buf, cur_op->reply.rw.bitmap_len);
         for (int role = 0; role < op_data->pg_size; role++)
         {
             if (stripes[role].req_end != 0)
@@ -250,7 +253,10 @@ resume_2:
     }
     else
     {
-        cur_op->iov.push_back(op_data->stripes[0].bmp_buf, cur_op->reply.rw.bitmap_len);
+        if (cur_op->reply.rw.bitmap_len <= 8)
+            memcpy(&cur_op->reply.rw.bitmap, op_data->stripes[0].bmp_buf, cur_op->reply.rw.bitmap_len);
+        else
+            cur_op->iov.push_back(op_data->stripes[0].bmp_buf, cur_op->reply.rw.bitmap_len);
         cur_op->iov.push_back(cur_op->buf, cur_op->req.rw.len);
     }
     finish_op(cur_op, cur_op->req.rw.len);
diff --git a/src/osd_primary_subops.cpp b/src/osd_primary_subops.cpp
index 9b8eff08..59424412 100644
--- a/src/osd_primary_subops.cpp
+++ b/src/osd_primary_subops.cpp
@@ -200,7 +200,7 @@ int osd_t::submit_primary_subop_batch(int submit_type, inode_t inode, uint64_t o
                     .version = op_version,
                     .offset = wr ? stripes[stripe_num].write_start : stripes[stripe_num].read_start,
                     .len = wr ? stripes[stripe_num].write_end - stripes[stripe_num].write_start : stripes[stripe_num].read_end - stripes[stripe_num].read_start,
-                    .attr_len = wr ? clean_entry_bitmap_size : 0,
+                    .bitmap_len = wr ? clean_entry_bitmap_size : 0,
                 };
 #ifdef OSD_DEBUG
                 printf(
diff --git a/src/osd_secondary.cpp b/src/osd_secondary.cpp
index 106b61de..6dc5a6ca 100644
--- a/src/osd_secondary.cpp
+++ b/src/osd_secondary.cpp
@@ -20,9 +20,9 @@ void osd_t::secondary_op_callback(osd_op_t *op)
     if (op->req.hdr.opcode == OSD_OP_SEC_READ)
     {
         if (op->bs_op->retval >= 0)
-            op->reply.sec_rw.attr_len = clean_entry_bitmap_size;
+            op->reply.sec_rw.bitmap_len = clean_entry_bitmap_size;
         else
-            op->reply.sec_rw.attr_len = 0;
+            op->reply.sec_rw.bitmap_len = 0;
         if (op->bs_op->retval > 0)
             op->iov.push_back(op->buf, op->bs_op->retval);
     }
@@ -81,7 +81,7 @@ void osd_t::exec_secondary(osd_op_t *cur_op)
         if (cur_op->req.hdr.opcode == OSD_OP_SEC_READ)
         {
             // Allocate memory for the read operation
-            if (clean_entry_bitmap_size > sizeof(unsigned))
+            if (clean_entry_bitmap_size > sizeof(void*))
                 cur_op->bitmap = cur_op->rmw_buf = malloc_or_die(clean_entry_bitmap_size);
             else
                 cur_op->bitmap = &cur_op->bmp_data;