Send bitmaps with primary reads; actually read bitmaps for READ ops

rdma-zerocopy
Vitaliy Filippov 2021-01-13 00:19:04 +03:00
parent 6bf88883ac
commit 0aa2dd2890
7 changed files with 28 additions and 3 deletions

View File

@ -63,6 +63,11 @@ cluster_client_t::cluster_client_t(ring_loop_t *ringloop, timerfd_manager_t *tfd
st_cli.parse_config(config); st_cli.parse_config(config);
st_cli.load_global_config(); st_cli.load_global_config();
// Temporary implementation: discard all bitmaps.
// It will, of course, be replaced once snapshots are implemented.
scrap_bitmap_size = 4096;
scrap_bitmap = malloc_or_die(scrap_bitmap_size);
if (ringloop) if (ringloop)
{ {
consumer.loop = [this]() consumer.loop = [this]()
@ -86,6 +91,7 @@ cluster_client_t::~cluster_client_t()
{ {
ringloop->unregister_consumer(&consumer); ringloop->unregister_consumer(&consumer);
} }
free(scrap_bitmap);
} }
void cluster_client_t::continue_ops(bool up_retry) void cluster_client_t::continue_ops(bool up_retry)
@ -681,6 +687,8 @@ bool cluster_client_t::try_send(cluster_op_t *op, int i)
.offset = part->offset, .offset = part->offset,
.len = part->len, .len = part->len,
} }, } },
.bitmap = scrap_bitmap,
.bitmap_len = scrap_bitmap_size,
.callback = [this, part](osd_op_t *op_part) .callback = [this, part](osd_op_t *op_part)
{ {
handle_op_part(part); handle_op_part(part);

View File

@ -77,6 +77,9 @@ class cluster_client_t
std::set<osd_num_t> dirty_osds; std::set<osd_num_t> dirty_osds;
uint64_t dirty_bytes = 0, dirty_ops = 0; uint64_t dirty_bytes = 0, dirty_ops = 0;
void *scrap_bitmap = NULL;
unsigned scrap_bitmap_size = 0;
bool pgs_loaded = false; bool pgs_loaded = false;
ring_consumer_t consumer; ring_consumer_t consumer;
std::vector<std::function<void(void)>> on_ready_hooks; std::vector<std::function<void(void)>> on_ready_hooks;

View File

@ -162,6 +162,7 @@ struct osd_op_t
blockstore_op_t *bs_op = NULL; blockstore_op_t *bs_op = NULL;
void *buf = NULL; void *buf = NULL;
void *bitmap = NULL; void *bitmap = NULL;
unsigned bitmap_len = 0;
unsigned bmp_data = 0; unsigned bmp_data = 0;
void *rmw_buf = NULL; void *rmw_buf = NULL;
osd_primary_op_data_t* op_data = NULL; osd_primary_op_data_t* op_data = NULL;

View File

@ -278,7 +278,9 @@ bool osd_messenger_t::handle_reply_hdr(osd_client_t *cl)
{ {
// Read data. In this case we assume that the buffer is preallocated by the caller (!) // Read data. In this case we assume that the buffer is preallocated by the caller (!)
assert(op->iov.count > 0); assert(op->iov.count > 0);
if (op->reply.hdr.retval != (op->reply.hdr.opcode == OSD_OP_SEC_READ ? op->req.sec_rw.len : op->req.rw.len)) unsigned bmp_len = (op->reply.hdr.opcode == OSD_OP_SEC_READ ? op->reply.sec_rw.attr_len : op->reply.rw.bitmap_len);
if (op->reply.hdr.retval != (op->reply.hdr.opcode == OSD_OP_SEC_READ ? op->req.sec_rw.len : op->req.rw.len) ||
bmp_len > op->bitmap_len)
{ {
// Check reply length to not overflow the buffer // Check reply length to not overflow the buffer
printf("Client %d read reply of different length\n", cl->peer_fd); printf("Client %d read reply of different length\n", cl->peer_fd);
@ -286,11 +288,15 @@ bool osd_messenger_t::handle_reply_hdr(osd_client_t *cl)
stop_client(cl->peer_fd); stop_client(cl->peer_fd);
return false; return false;
} }
if (bmp_len > 0)
{
cl->recv_list.push_back(op->bitmap, bmp_len);
}
cl->recv_list.append(op->iov); cl->recv_list.append(op->iov);
delete cl->read_op; delete cl->read_op;
cl->read_op = op; cl->read_op = op;
cl->read_state = CL_READ_REPLY_DATA; cl->read_state = CL_READ_REPLY_DATA;
cl->read_remaining = op->reply.hdr.retval; cl->read_remaining = op->reply.hdr.retval + (op->reply.hdr.opcode == OSD_OP_SEC_READ ? op->reply.sec_rw.attr_len : op->reply.rw.bitmap_len);
} }
else if (op->reply.hdr.opcode == OSD_OP_SEC_LIST && op->reply.hdr.retval > 0) else if (op->reply.hdr.opcode == OSD_OP_SEC_LIST && op->reply.hdr.retval > 0)
{ {

View File

@ -73,6 +73,7 @@ struct __attribute__((__packed__)) osd_op_secondary_rw_t
uint32_t len; uint32_t len;
// bitmap/attribute length - bitmap comes after header, but before data // bitmap/attribute length - bitmap comes after header, but before data
uint32_t attr_len; uint32_t attr_len;
uint32_t pad0;
}; };
struct __attribute__((__packed__)) osd_reply_secondary_rw_t struct __attribute__((__packed__)) osd_reply_secondary_rw_t
@ -82,6 +83,7 @@ struct __attribute__((__packed__)) osd_reply_secondary_rw_t
uint64_t version; uint64_t version;
// for reads: bitmap/attribute length (just to double-check) // for reads: bitmap/attribute length (just to double-check)
uint32_t attr_len; uint32_t attr_len;
uint32_t pad0;
}; };
// delete object on the secondary OSD // delete object on the secondary OSD
@ -158,7 +160,6 @@ struct __attribute__((__packed__)) osd_reply_secondary_list_t
}; };
// read or write to the primary OSD (must be within individual stripe) // read or write to the primary OSD (must be within individual stripe)
// FIXME: allow to return used block bitmap (required for snapshots)
struct __attribute__((__packed__)) osd_op_rw_t struct __attribute__((__packed__)) osd_op_rw_t
{ {
osd_op_header_t header; osd_op_header_t header;
@ -173,6 +174,9 @@ struct __attribute__((__packed__)) osd_op_rw_t
struct __attribute__((__packed__)) osd_reply_rw_t struct __attribute__((__packed__)) osd_reply_rw_t
{ {
osd_reply_header_t header; osd_reply_header_t header;
// for reads: bitmap length
uint32_t bitmap_len;
uint32_t pad0;
}; };
// sync to the primary OSD // sync to the primary OSD

View File

@ -179,6 +179,8 @@ resume_2:
} }
else else
{ {
cur_op->reply.rw.bitmap_len = op_data->pg_data_size * entry_attr_size;
cur_op->iov.push_back(op_data->stripes[0].bmp_buf, cur_op->reply.rw.bitmap_len);
cur_op->iov.push_back(cur_op->buf, cur_op->req.rw.len); cur_op->iov.push_back(cur_op->buf, cur_op->req.rw.len);
} }
finish_op(cur_op, cur_op->req.rw.len); finish_op(cur_op, cur_op->req.rw.len);

View File

@ -161,6 +161,7 @@ void osd_t::submit_primary_subops(int submit_type, uint64_t op_version, int pg_s
subops[i].op_type = OSD_OP_OUT; subops[i].op_type = OSD_OP_OUT;
subops[i].peer_fd = c_cli.osd_peer_fds.at(role_osd_num); subops[i].peer_fd = c_cli.osd_peer_fds.at(role_osd_num);
subops[i].bitmap = stripes[stripe_num].bmp_buf; subops[i].bitmap = stripes[stripe_num].bmp_buf;
subops[i].bitmap_len = entry_attr_size;
subops[i].req.sec_rw = { subops[i].req.sec_rw = {
.header = { .header = {
.magic = SECONDARY_OSD_OP_MAGIC, .magic = SECONDARY_OSD_OP_MAGIC,