Browse Source

Remove cryptic bitmap inlining from bs_op_t and osd_op_t, use bitmap in primary OSD code

rdma-zerocopy
Vitaliy Filippov 7 months ago
parent
commit
004f265393
  1. 7
      src/blockstore.h
  2. 19
      src/blockstore_read.cpp
  3. 8
      src/blockstore_write.cpp
  4. 1
      src/msgr_op.h
  5. 8
      src/msgr_receive.cpp
  6. 2
      src/msgr_send.cpp
  7. 12
      src/osd_primary.cpp
  8. 3
      src/osd_primary_subops.cpp
  9. 11
      src/osd_primary_write.cpp
  10. 1
      src/osd_rmw.cpp
  11. 9
      src/osd_secondary.cpp

7
src/blockstore.h

@ -65,10 +65,9 @@ Input:
- offset, len = offset and length within object. length may be zero; in that case
a read operation only returns the version / a write operation only bumps the version
- buf = pre-allocated buffer for data (read) / with data (write). may be NULL if len == 0.
- bitmap = <entry_attr_size> bytes long arbitrary data stored for each object in the metadata area.
when <entry_attr_size> fits into pointer size, it should be passed as this field's value.
when it doesn't fit, this field should be a pointer to that piece of data.
named "bitmap" because it's used for the "external bitmap" in Vitastor.
- bitmap = pointer to <entry_attr_size> bytes of (usually very short) arbitrary data
stored for each object in the metadata area.
Called "bitmap" because it's used for the "external bitmap" in Vitastor.
Output:
- retval = number of bytes actually read/written or negative error number (-EINVAL or -ENOSPC)

19
src/blockstore_read.cpp

@ -149,10 +149,11 @@ int blockstore_impl_t::dequeue_read(blockstore_op_t *read_op)
if (!result_version)
{
result_version = dirty_it->first.version;
if (entry_attr_size <= sizeof(void*))
read_op->bitmap = dirty_it->second.bitmap;
else if (read_op->bitmap)
memcpy(read_op->bitmap, dirty_it->second.bitmap, entry_attr_size);
if (read_op->bitmap)
{
void *bmp_ptr = (entry_attr_size > sizeof(void*) ? dirty_it->second.bitmap : &dirty_it->second.bitmap);
memcpy(read_op->bitmap, bmp_ptr, entry_attr_size);
}
}
if (!fulfill_read(read_op, fulfilled, dirty.offset, dirty.offset + dirty.len,
dirty.state, dirty_it->first.version, dirty.location + (IS_JOURNAL(dirty.state) ? 0 : dirty.offset)))
@ -174,11 +175,11 @@ int blockstore_impl_t::dequeue_read(blockstore_op_t *read_op)
if (!result_version)
{
result_version = clean_it->second.version;
void *clean_entry_bitmap = get_clean_entry_bitmap(clean_it->second.location, clean_entry_bitmap_size);
if (entry_attr_size <= sizeof(void*))
memcpy(&read_op->bitmap, clean_entry_bitmap, entry_attr_size);
else if (read_op->bitmap)
memcpy(read_op->bitmap, clean_entry_bitmap, entry_attr_size);
if (read_op->bitmap)
{
void *bmp_ptr = get_clean_entry_bitmap(clean_it->second.location, clean_entry_bitmap_size);
memcpy(read_op->bitmap, bmp_ptr, entry_attr_size);
}
}
if (fulfilled < read_op->len)
{

8
src/blockstore_write.cpp

@ -111,13 +111,9 @@ bool blockstore_impl_t::enqueue_write(blockstore_op_t *op)
if (op->opcode == BS_OP_WRITE_STABLE)
state |= BS_ST_INSTANT;
if (entry_attr_size > sizeof(void*))
{
bmp = calloc_or_die(1, entry_attr_size);
if (op->bitmap)
memcpy(bmp, op->bitmap, entry_attr_size);
}
else
bmp = op->bitmap;
if (op->bitmap)
memcpy((entry_attr_size > sizeof(void*) ? bmp : &bmp), op->bitmap, entry_attr_size);
}
dirty_db.emplace((obj_ver_id){
.oid = op->oid,

1
src/msgr_op.h

@ -162,6 +162,7 @@ struct osd_op_t
blockstore_op_t *bs_op = NULL;
void *buf = NULL;
void *bitmap = NULL;
unsigned bmp_data = 0;
void *rmw_buf = NULL;
osd_primary_op_data_t* op_data = NULL;
std::function<void(osd_op_t*)> callback;

8
src/msgr_receive.cpp

@ -209,13 +209,11 @@ void osd_messenger_t::handle_op_hdr(osd_client_t *cl)
{
if (cur_op->req.sec_rw.attr_len > 0)
{
if (cur_op->req.sec_rw.attr_len > sizeof(void*))
{
if (cur_op->req.sec_rw.attr_len > sizeof(unsigned))
cur_op->bitmap = cur_op->rmw_buf = malloc_or_die(cur_op->req.sec_rw.attr_len);
cl->recv_list.push_back(cur_op->bitmap, cur_op->req.sec_rw.attr_len);
}
else
cl->recv_list.push_back(&cur_op->bitmap, cur_op->req.sec_rw.attr_len);
cur_op->bitmap = &cur_op->bmp_data;
cl->recv_list.push_back(cur_op->bitmap, cur_op->req.sec_rw.attr_len);
}
if (cur_op->req.sec_rw.len > 0)
{

2
src/msgr_send.cpp

@ -66,7 +66,7 @@ void osd_messenger_t::outbox_push(osd_op_t *cur_op)
cur_op->req.sec_rw.attr_len > 0)
{
to_send_list.push_back((iovec){
.iov_base = (cur_op->reply.sec_rw.attr_len > sizeof(void*) ? cur_op->bitmap : &cur_op->bitmap),
.iov_base = cur_op->bitmap,
.iov_len = cur_op->reply.sec_rw.attr_len,
});
to_outbox.push_back(NULL);

12
src/osd_primary.cpp

@ -2,6 +2,7 @@
// License: VNPL-1.1 (see README.md for details)
#include "osd_primary.h"
#include "allocator.h"
// read: read directly or read paired stripe(s), reconstruct, return
// write: read paired stripe(s), reconstruct, modify, calculate parity, write
@ -51,7 +52,8 @@ bool osd_t::prepare_primary_rw(osd_op_t *cur_op)
return false;
}
osd_primary_op_data_t *op_data = (osd_primary_op_data_t*)calloc_or_die(
1, sizeof(osd_primary_op_data_t) + sizeof(osd_rmw_stripe_t) * (pool_cfg.scheme == POOL_SCHEME_REPLICATED ? 1 : pg_it->second.pg_size)
1, sizeof(osd_primary_op_data_t) + entry_attr_size +
sizeof(osd_rmw_stripe_t) * (pool_cfg.scheme == POOL_SCHEME_REPLICATED ? 1 : pg_it->second.pg_size)
);
op_data->pg_num = pg_num;
op_data->oid = oid;
@ -115,7 +117,7 @@ void osd_t::continue_primary_read(osd_op_t *cur_op)
if (pg.state == PG_ACTIVE || op_data->scheme == POOL_SCHEME_REPLICATED)
{
// Fast happy-path
cur_op->buf = alloc_read_buffer(op_data->stripes, op_data->pg_data_size, 0, 0);
cur_op->buf = alloc_read_buffer(op_data->stripes, op_data->pg_data_size, 0, entry_attr_size);
submit_primary_subops(SUBMIT_READ, op_data->target_ver,
(op_data->scheme == POOL_SCHEME_REPLICATED ? pg.pg_size : op_data->pg_data_size), pg.cur_set.data(), cur_op);
op_data->st = 1;
@ -133,7 +135,7 @@ void osd_t::continue_primary_read(osd_op_t *cur_op)
op_data->pg_size = pg.pg_size;
op_data->scheme = pg.scheme;
op_data->degraded = 1;
cur_op->buf = alloc_read_buffer(op_data->stripes, pg.pg_size, 0, 0);
cur_op->buf = alloc_read_buffer(op_data->stripes, pg.pg_size, 0, entry_attr_size);
submit_primary_subops(SUBMIT_READ, op_data->target_ver, pg.pg_size, cur_set, cur_op);
op_data->st = 1;
}
@ -152,11 +154,11 @@ resume_2:
osd_rmw_stripe_t *stripes = op_data->stripes;
if (op_data->scheme == POOL_SCHEME_XOR)
{
reconstruct_stripes_xor(stripes, op_data->pg_size, 0);
reconstruct_stripes_xor(stripes, op_data->pg_size, entry_attr_size);
}
else if (op_data->scheme == POOL_SCHEME_JERASURE)
{
reconstruct_stripes_jerasure(stripes, op_data->pg_size, op_data->pg_data_size, 0);
reconstruct_stripes_jerasure(stripes, op_data->pg_size, op_data->pg_data_size, entry_attr_size);
}
for (int role = 0; role < op_data->pg_size; role++)
{

3
src/osd_primary_subops.cpp

@ -145,6 +145,7 @@ void osd_t::submit_primary_subops(int submit_type, uint64_t op_version, int pg_s
.offset = wr ? stripes[stripe_num].write_start : stripes[stripe_num].read_start,
.len = wr ? stripes[stripe_num].write_end - stripes[stripe_num].write_start : stripes[stripe_num].read_end - stripes[stripe_num].read_start,
.buf = wr ? stripes[stripe_num].write_buf : stripes[stripe_num].read_buf,
.bitmap = stripes[stripe_num].bmp_buf,
});
#ifdef OSD_DEBUG
printf(
@ -159,6 +160,7 @@ void osd_t::submit_primary_subops(int submit_type, uint64_t op_version, int pg_s
{
subops[i].op_type = OSD_OP_OUT;
subops[i].peer_fd = c_cli.osd_peer_fds.at(role_osd_num);
subops[i].bitmap = stripes[stripe_num].bmp_buf;
subops[i].req.sec_rw = {
.header = {
.magic = SECONDARY_OSD_OP_MAGIC,
@ -172,6 +174,7 @@ void osd_t::submit_primary_subops(int submit_type, uint64_t op_version, int pg_s
.version = op_version,
.offset = wr ? stripes[stripe_num].write_start : stripes[stripe_num].read_start,
.len = wr ? stripes[stripe_num].write_end - stripes[stripe_num].write_start : stripes[stripe_num].read_end - stripes[stripe_num].read_start,
.attr_len = entry_attr_size,
};
#ifdef OSD_DEBUG
printf(

11
src/osd_primary_write.cpp

@ -65,6 +65,7 @@ resume_1:
op_data->stripes[0].write_start = op_data->stripes[0].req_start;
op_data->stripes[0].write_end = op_data->stripes[0].req_end;
op_data->stripes[0].write_buf = cur_op->buf;
op_data->stripes[0].bmp_buf = (void*)(op_data->stripes+1);
if (pg.cur_set.data() != op_data->prev_set && (op_data->stripes[0].write_start != 0 ||
op_data->stripes[0].write_end != bs_block_size))
{
@ -77,7 +78,7 @@ resume_1:
else
{
cur_op->rmw_buf = calc_rmw(cur_op->buf, op_data->stripes, op_data->prev_set,
pg.pg_size, op_data->pg_data_size, pg.pg_cursize, pg.cur_set.data(), bs_block_size, 0);
pg.pg_size, op_data->pg_data_size, pg.pg_cursize, pg.cur_set.data(), bs_block_size, entry_attr_size);
if (!cur_op->rmw_buf)
{
// Refuse partial overwrite of an incomplete object
@ -98,7 +99,9 @@ resume_3:
}
if (op_data->scheme == POOL_SCHEME_REPLICATED)
{
// Only (possibly) copy new data from the request into the recovery buffer
// Set bitmap bits
bitmap_set(op_data->stripes[0].bmp_buf, op_data->stripes[0].write_start, op_data->stripes[0].write_end, bs_bitmap_granularity);
// Possibly copy new data from the request into the recovery buffer
if (pg.cur_set.data() != op_data->prev_set && (op_data->stripes[0].write_start != 0 ||
op_data->stripes[0].write_end != bs_block_size))
{
@ -120,11 +123,11 @@ resume_3:
// Recover missing stripes, calculate parity
if (pg.scheme == POOL_SCHEME_XOR)
{
calc_rmw_parity_xor(op_data->stripes, pg.pg_size, op_data->prev_set, pg.cur_set.data(), bs_block_size, 0);
calc_rmw_parity_xor(op_data->stripes, pg.pg_size, op_data->prev_set, pg.cur_set.data(), bs_block_size, entry_attr_size);
}
else if (pg.scheme == POOL_SCHEME_JERASURE)
{
calc_rmw_parity_jerasure(op_data->stripes, pg.pg_size, op_data->pg_data_size, op_data->prev_set, pg.cur_set.data(), bs_block_size, 0);
calc_rmw_parity_jerasure(op_data->stripes, pg.pg_size, op_data->pg_data_size, op_data->prev_set, pg.cur_set.data(), bs_block_size, entry_attr_size);
}
}
// Send writes

1
src/osd_rmw.cpp

@ -332,6 +332,7 @@ void* alloc_read_buffer(osd_rmw_stripe_t *stripes, int read_pg_size, uint64_t ad
}
}
// Bitmaps are allocated at the end so data buffers remain aligned
// FIXME: Don't allocate bitmaps here because it probably increases memory fragmentation
if (bitmap_size > 0)
{
for (int role = 0; role < read_pg_size; role++)

9
src/osd_secondary.cpp

@ -19,11 +19,6 @@ void osd_t::secondary_op_callback(osd_op_t *op)
}
if (op->req.hdr.opcode == OSD_OP_SEC_READ)
{
if (entry_attr_size > 0)
{
op->reply.sec_rw.attr_len = entry_attr_size;
op->iov.push_back((entry_attr_size > sizeof(void*) ? op->bitmap : &op->bs_op->bitmap), entry_attr_size);
}
if (op->bs_op->retval > 0)
{
op->iov.push_back(op->buf, op->bs_op->retval);
@ -65,8 +60,10 @@ void osd_t::exec_secondary(osd_op_t *cur_op)
if (cur_op->req.hdr.opcode == OSD_OP_SEC_READ)
{
// Allocate memory for the read operation
if (entry_attr_size > sizeof(void*))
if (entry_attr_size > sizeof(unsigned))
cur_op->bitmap = cur_op->rmw_buf = malloc_or_die(entry_attr_size);
else
cur_op->bitmap = &cur_op->bmp_data;
if (cur_op->req.sec_rw.len > 0)
cur_op->buf = memalign_or_die(MEM_ALIGNMENT, cur_op->req.sec_rw.len);
}

Loading…
Cancel
Save