Remove cryptic bitmap inlining from bs_op_t and osd_op_t, use bitmap in primary OSD code

rdma-zerocopy
Vitaliy Filippov 2021-01-12 01:02:56 +03:00
parent 860ac24762
commit 004f265393
11 changed files with 41 additions and 40 deletions

View File

@ -65,10 +65,9 @@ Input:
- offset, len = offset and length within object. length may be zero, in that case - offset, len = offset and length within object. length may be zero, in that case
read operation only returns the version / write operation only bumps the version read operation only returns the version / write operation only bumps the version
- buf = pre-allocated buffer for data (read) / with data (write). may be NULL if len == 0. - buf = pre-allocated buffer for data (read) / with data (write). may be NULL if len == 0.
- bitmap = <entry_attr_size> bytes long arbitrary data stored for each object in the metadata area. - bitmap = pointer to <entry_attr_size> bytes long (usually very short) arbitrary data
when <entry_attr_size> fits into pointer size, it should be passed as this field's value. stored for each object in the metadata area.
when it doesn't fit, this field should be a pointer to that piece of data. Called "bitmap" because it's used for the "external bitmap" in Vitastor.
named "bitmap" because it's used for the "external bitmap" in Vitastor.
Output: Output:
- retval = number of bytes actually read/written or negative error number (-EINVAL or -ENOSPC) - retval = number of bytes actually read/written or negative error number (-EINVAL or -ENOSPC)

View File

@ -149,10 +149,11 @@ int blockstore_impl_t::dequeue_read(blockstore_op_t *read_op)
if (!result_version) if (!result_version)
{ {
result_version = dirty_it->first.version; result_version = dirty_it->first.version;
if (entry_attr_size <= sizeof(void*)) if (read_op->bitmap)
read_op->bitmap = dirty_it->second.bitmap; {
else if (read_op->bitmap) void *bmp_ptr = (entry_attr_size > sizeof(void*) ? dirty_it->second.bitmap : &dirty_it->second.bitmap);
memcpy(read_op->bitmap, dirty_it->second.bitmap, entry_attr_size); memcpy(read_op->bitmap, bmp_ptr, entry_attr_size);
}
} }
if (!fulfill_read(read_op, fulfilled, dirty.offset, dirty.offset + dirty.len, if (!fulfill_read(read_op, fulfilled, dirty.offset, dirty.offset + dirty.len,
dirty.state, dirty_it->first.version, dirty.location + (IS_JOURNAL(dirty.state) ? 0 : dirty.offset))) dirty.state, dirty_it->first.version, dirty.location + (IS_JOURNAL(dirty.state) ? 0 : dirty.offset)))
@ -174,11 +175,11 @@ int blockstore_impl_t::dequeue_read(blockstore_op_t *read_op)
if (!result_version) if (!result_version)
{ {
result_version = clean_it->second.version; result_version = clean_it->second.version;
void *clean_entry_bitmap = get_clean_entry_bitmap(clean_it->second.location, clean_entry_bitmap_size); if (read_op->bitmap)
if (entry_attr_size <= sizeof(void*)) {
memcpy(&read_op->bitmap, clean_entry_bitmap, entry_attr_size); void *bmp_ptr = get_clean_entry_bitmap(clean_it->second.location, clean_entry_bitmap_size);
else if (read_op->bitmap) memcpy(read_op->bitmap, bmp_ptr, entry_attr_size);
memcpy(read_op->bitmap, clean_entry_bitmap, entry_attr_size); }
} }
if (fulfilled < read_op->len) if (fulfilled < read_op->len)
{ {

View File

@ -111,13 +111,9 @@ bool blockstore_impl_t::enqueue_write(blockstore_op_t *op)
if (op->opcode == BS_OP_WRITE_STABLE) if (op->opcode == BS_OP_WRITE_STABLE)
state |= BS_ST_INSTANT; state |= BS_ST_INSTANT;
if (entry_attr_size > sizeof(void*)) if (entry_attr_size > sizeof(void*))
{
bmp = calloc_or_die(1, entry_attr_size); bmp = calloc_or_die(1, entry_attr_size);
if (op->bitmap) if (op->bitmap)
memcpy(bmp, op->bitmap, entry_attr_size); memcpy((entry_attr_size > sizeof(void*) ? bmp : &bmp), op->bitmap, entry_attr_size);
}
else
bmp = op->bitmap;
} }
dirty_db.emplace((obj_ver_id){ dirty_db.emplace((obj_ver_id){
.oid = op->oid, .oid = op->oid,

View File

@ -162,6 +162,7 @@ struct osd_op_t
blockstore_op_t *bs_op = NULL; blockstore_op_t *bs_op = NULL;
void *buf = NULL; void *buf = NULL;
void *bitmap = NULL; void *bitmap = NULL;
unsigned bmp_data = 0;
void *rmw_buf = NULL; void *rmw_buf = NULL;
osd_primary_op_data_t* op_data = NULL; osd_primary_op_data_t* op_data = NULL;
std::function<void(osd_op_t*)> callback; std::function<void(osd_op_t*)> callback;

View File

@ -209,13 +209,11 @@ void osd_messenger_t::handle_op_hdr(osd_client_t *cl)
{ {
if (cur_op->req.sec_rw.attr_len > 0) if (cur_op->req.sec_rw.attr_len > 0)
{ {
if (cur_op->req.sec_rw.attr_len > sizeof(void*)) if (cur_op->req.sec_rw.attr_len > sizeof(unsigned))
{
cur_op->bitmap = cur_op->rmw_buf = malloc_or_die(cur_op->req.sec_rw.attr_len); cur_op->bitmap = cur_op->rmw_buf = malloc_or_die(cur_op->req.sec_rw.attr_len);
cl->recv_list.push_back(cur_op->bitmap, cur_op->req.sec_rw.attr_len);
}
else else
cl->recv_list.push_back(&cur_op->bitmap, cur_op->req.sec_rw.attr_len); cur_op->bitmap = &cur_op->bmp_data;
cl->recv_list.push_back(cur_op->bitmap, cur_op->req.sec_rw.attr_len);
} }
if (cur_op->req.sec_rw.len > 0) if (cur_op->req.sec_rw.len > 0)
{ {

View File

@ -66,7 +66,7 @@ void osd_messenger_t::outbox_push(osd_op_t *cur_op)
cur_op->req.sec_rw.attr_len > 0) cur_op->req.sec_rw.attr_len > 0)
{ {
to_send_list.push_back((iovec){ to_send_list.push_back((iovec){
.iov_base = (cur_op->reply.sec_rw.attr_len > sizeof(void*) ? cur_op->bitmap : &cur_op->bitmap), .iov_base = cur_op->bitmap,
.iov_len = cur_op->reply.sec_rw.attr_len, .iov_len = cur_op->reply.sec_rw.attr_len,
}); });
to_outbox.push_back(NULL); to_outbox.push_back(NULL);

View File

@ -2,6 +2,7 @@
// License: VNPL-1.1 (see README.md for details) // License: VNPL-1.1 (see README.md for details)
#include "osd_primary.h" #include "osd_primary.h"
#include "allocator.h"
// read: read directly or read paired stripe(s), reconstruct, return // read: read directly or read paired stripe(s), reconstruct, return
// write: read paired stripe(s), reconstruct, modify, calculate parity, write // write: read paired stripe(s), reconstruct, modify, calculate parity, write
@ -51,7 +52,8 @@ bool osd_t::prepare_primary_rw(osd_op_t *cur_op)
return false; return false;
} }
osd_primary_op_data_t *op_data = (osd_primary_op_data_t*)calloc_or_die( osd_primary_op_data_t *op_data = (osd_primary_op_data_t*)calloc_or_die(
1, sizeof(osd_primary_op_data_t) + sizeof(osd_rmw_stripe_t) * (pool_cfg.scheme == POOL_SCHEME_REPLICATED ? 1 : pg_it->second.pg_size) 1, sizeof(osd_primary_op_data_t) + entry_attr_size +
sizeof(osd_rmw_stripe_t) * (pool_cfg.scheme == POOL_SCHEME_REPLICATED ? 1 : pg_it->second.pg_size)
); );
op_data->pg_num = pg_num; op_data->pg_num = pg_num;
op_data->oid = oid; op_data->oid = oid;
@ -115,7 +117,7 @@ void osd_t::continue_primary_read(osd_op_t *cur_op)
if (pg.state == PG_ACTIVE || op_data->scheme == POOL_SCHEME_REPLICATED) if (pg.state == PG_ACTIVE || op_data->scheme == POOL_SCHEME_REPLICATED)
{ {
// Fast happy-path // Fast happy-path
cur_op->buf = alloc_read_buffer(op_data->stripes, op_data->pg_data_size, 0, 0); cur_op->buf = alloc_read_buffer(op_data->stripes, op_data->pg_data_size, 0, entry_attr_size);
submit_primary_subops(SUBMIT_READ, op_data->target_ver, submit_primary_subops(SUBMIT_READ, op_data->target_ver,
(op_data->scheme == POOL_SCHEME_REPLICATED ? pg.pg_size : op_data->pg_data_size), pg.cur_set.data(), cur_op); (op_data->scheme == POOL_SCHEME_REPLICATED ? pg.pg_size : op_data->pg_data_size), pg.cur_set.data(), cur_op);
op_data->st = 1; op_data->st = 1;
@ -133,7 +135,7 @@ void osd_t::continue_primary_read(osd_op_t *cur_op)
op_data->pg_size = pg.pg_size; op_data->pg_size = pg.pg_size;
op_data->scheme = pg.scheme; op_data->scheme = pg.scheme;
op_data->degraded = 1; op_data->degraded = 1;
cur_op->buf = alloc_read_buffer(op_data->stripes, pg.pg_size, 0, 0); cur_op->buf = alloc_read_buffer(op_data->stripes, pg.pg_size, 0, entry_attr_size);
submit_primary_subops(SUBMIT_READ, op_data->target_ver, pg.pg_size, cur_set, cur_op); submit_primary_subops(SUBMIT_READ, op_data->target_ver, pg.pg_size, cur_set, cur_op);
op_data->st = 1; op_data->st = 1;
} }
@ -152,11 +154,11 @@ resume_2:
osd_rmw_stripe_t *stripes = op_data->stripes; osd_rmw_stripe_t *stripes = op_data->stripes;
if (op_data->scheme == POOL_SCHEME_XOR) if (op_data->scheme == POOL_SCHEME_XOR)
{ {
reconstruct_stripes_xor(stripes, op_data->pg_size, 0); reconstruct_stripes_xor(stripes, op_data->pg_size, entry_attr_size);
} }
else if (op_data->scheme == POOL_SCHEME_JERASURE) else if (op_data->scheme == POOL_SCHEME_JERASURE)
{ {
reconstruct_stripes_jerasure(stripes, op_data->pg_size, op_data->pg_data_size, 0); reconstruct_stripes_jerasure(stripes, op_data->pg_size, op_data->pg_data_size, entry_attr_size);
} }
for (int role = 0; role < op_data->pg_size; role++) for (int role = 0; role < op_data->pg_size; role++)
{ {

View File

@ -145,6 +145,7 @@ void osd_t::submit_primary_subops(int submit_type, uint64_t op_version, int pg_s
.offset = wr ? stripes[stripe_num].write_start : stripes[stripe_num].read_start, .offset = wr ? stripes[stripe_num].write_start : stripes[stripe_num].read_start,
.len = wr ? stripes[stripe_num].write_end - stripes[stripe_num].write_start : stripes[stripe_num].read_end - stripes[stripe_num].read_start, .len = wr ? stripes[stripe_num].write_end - stripes[stripe_num].write_start : stripes[stripe_num].read_end - stripes[stripe_num].read_start,
.buf = wr ? stripes[stripe_num].write_buf : stripes[stripe_num].read_buf, .buf = wr ? stripes[stripe_num].write_buf : stripes[stripe_num].read_buf,
.bitmap = stripes[stripe_num].bmp_buf,
}); });
#ifdef OSD_DEBUG #ifdef OSD_DEBUG
printf( printf(
@ -159,6 +160,7 @@ void osd_t::submit_primary_subops(int submit_type, uint64_t op_version, int pg_s
{ {
subops[i].op_type = OSD_OP_OUT; subops[i].op_type = OSD_OP_OUT;
subops[i].peer_fd = c_cli.osd_peer_fds.at(role_osd_num); subops[i].peer_fd = c_cli.osd_peer_fds.at(role_osd_num);
subops[i].bitmap = stripes[stripe_num].bmp_buf;
subops[i].req.sec_rw = { subops[i].req.sec_rw = {
.header = { .header = {
.magic = SECONDARY_OSD_OP_MAGIC, .magic = SECONDARY_OSD_OP_MAGIC,
@ -172,6 +174,7 @@ void osd_t::submit_primary_subops(int submit_type, uint64_t op_version, int pg_s
.version = op_version, .version = op_version,
.offset = wr ? stripes[stripe_num].write_start : stripes[stripe_num].read_start, .offset = wr ? stripes[stripe_num].write_start : stripes[stripe_num].read_start,
.len = wr ? stripes[stripe_num].write_end - stripes[stripe_num].write_start : stripes[stripe_num].read_end - stripes[stripe_num].read_start, .len = wr ? stripes[stripe_num].write_end - stripes[stripe_num].write_start : stripes[stripe_num].read_end - stripes[stripe_num].read_start,
.attr_len = entry_attr_size,
}; };
#ifdef OSD_DEBUG #ifdef OSD_DEBUG
printf( printf(

View File

@ -65,6 +65,7 @@ resume_1:
op_data->stripes[0].write_start = op_data->stripes[0].req_start; op_data->stripes[0].write_start = op_data->stripes[0].req_start;
op_data->stripes[0].write_end = op_data->stripes[0].req_end; op_data->stripes[0].write_end = op_data->stripes[0].req_end;
op_data->stripes[0].write_buf = cur_op->buf; op_data->stripes[0].write_buf = cur_op->buf;
op_data->stripes[0].bmp_buf = (void*)(op_data->stripes+1);
if (pg.cur_set.data() != op_data->prev_set && (op_data->stripes[0].write_start != 0 || if (pg.cur_set.data() != op_data->prev_set && (op_data->stripes[0].write_start != 0 ||
op_data->stripes[0].write_end != bs_block_size)) op_data->stripes[0].write_end != bs_block_size))
{ {
@ -77,7 +78,7 @@ resume_1:
else else
{ {
cur_op->rmw_buf = calc_rmw(cur_op->buf, op_data->stripes, op_data->prev_set, cur_op->rmw_buf = calc_rmw(cur_op->buf, op_data->stripes, op_data->prev_set,
pg.pg_size, op_data->pg_data_size, pg.pg_cursize, pg.cur_set.data(), bs_block_size, 0); pg.pg_size, op_data->pg_data_size, pg.pg_cursize, pg.cur_set.data(), bs_block_size, entry_attr_size);
if (!cur_op->rmw_buf) if (!cur_op->rmw_buf)
{ {
// Refuse partial overwrite of an incomplete object // Refuse partial overwrite of an incomplete object
@ -98,7 +99,9 @@ resume_3:
} }
if (op_data->scheme == POOL_SCHEME_REPLICATED) if (op_data->scheme == POOL_SCHEME_REPLICATED)
{ {
// Only (possibly) copy new data from the request into the recovery buffer // Set bitmap bits
bitmap_set(op_data->stripes[0].bmp_buf, op_data->stripes[0].write_start, op_data->stripes[0].write_end, bs_bitmap_granularity);
// Possibly copy new data from the request into the recovery buffer
if (pg.cur_set.data() != op_data->prev_set && (op_data->stripes[0].write_start != 0 || if (pg.cur_set.data() != op_data->prev_set && (op_data->stripes[0].write_start != 0 ||
op_data->stripes[0].write_end != bs_block_size)) op_data->stripes[0].write_end != bs_block_size))
{ {
@ -120,11 +123,11 @@ resume_3:
// Recover missing stripes, calculate parity // Recover missing stripes, calculate parity
if (pg.scheme == POOL_SCHEME_XOR) if (pg.scheme == POOL_SCHEME_XOR)
{ {
calc_rmw_parity_xor(op_data->stripes, pg.pg_size, op_data->prev_set, pg.cur_set.data(), bs_block_size, 0); calc_rmw_parity_xor(op_data->stripes, pg.pg_size, op_data->prev_set, pg.cur_set.data(), bs_block_size, entry_attr_size);
} }
else if (pg.scheme == POOL_SCHEME_JERASURE) else if (pg.scheme == POOL_SCHEME_JERASURE)
{ {
calc_rmw_parity_jerasure(op_data->stripes, pg.pg_size, op_data->pg_data_size, op_data->prev_set, pg.cur_set.data(), bs_block_size, 0); calc_rmw_parity_jerasure(op_data->stripes, pg.pg_size, op_data->pg_data_size, op_data->prev_set, pg.cur_set.data(), bs_block_size, entry_attr_size);
} }
} }
// Send writes // Send writes

View File

@ -332,6 +332,7 @@ void* alloc_read_buffer(osd_rmw_stripe_t *stripes, int read_pg_size, uint64_t ad
} }
} }
// Bitmaps are allocated in the end so data buffers remain aligned // Bitmaps are allocated in the end so data buffers remain aligned
// FIXME: Don't allocate bitmaps here because it probably increases memory fragmentation
if (bitmap_size > 0) if (bitmap_size > 0)
{ {
for (int role = 0; role < read_pg_size; role++) for (int role = 0; role < read_pg_size; role++)

View File

@ -19,11 +19,6 @@ void osd_t::secondary_op_callback(osd_op_t *op)
} }
if (op->req.hdr.opcode == OSD_OP_SEC_READ) if (op->req.hdr.opcode == OSD_OP_SEC_READ)
{ {
if (entry_attr_size > 0)
{
op->reply.sec_rw.attr_len = entry_attr_size;
op->iov.push_back((entry_attr_size > sizeof(void*) ? op->bitmap : &op->bs_op->bitmap), entry_attr_size);
}
if (op->bs_op->retval > 0) if (op->bs_op->retval > 0)
{ {
op->iov.push_back(op->buf, op->bs_op->retval); op->iov.push_back(op->buf, op->bs_op->retval);
@ -65,8 +60,10 @@ void osd_t::exec_secondary(osd_op_t *cur_op)
if (cur_op->req.hdr.opcode == OSD_OP_SEC_READ) if (cur_op->req.hdr.opcode == OSD_OP_SEC_READ)
{ {
// Allocate memory for the read operation // Allocate memory for the read operation
if (entry_attr_size > sizeof(void*)) if (entry_attr_size > sizeof(unsigned))
cur_op->bitmap = cur_op->rmw_buf = malloc_or_die(entry_attr_size); cur_op->bitmap = cur_op->rmw_buf = malloc_or_die(entry_attr_size);
else
cur_op->bitmap = &cur_op->bmp_data;
if (cur_op->req.sec_rw.len > 0) if (cur_op->req.sec_rw.len > 0)
cur_op->buf = memalign_or_die(MEM_ALIGNMENT, cur_op->req.sec_rw.len); cur_op->buf = memalign_or_die(MEM_ALIGNMENT, cur_op->req.sec_rw.len);
} }