Browse Source

Add "external" bitmap support to the secondary OSD protocol

rdma-zerocopy
Vitaliy Filippov 7 months ago
parent
commit
860ac24762
  1. 1
      src/blockstore.h
  2. 2
      src/blockstore_open.cpp
  3. 2
      src/cluster_client.h
  4. 1
      src/messenger.h
  5. 1
      src/msgr_op.h
  6. 24
      src/msgr_receive.cpp
  7. 12
      src/msgr_send.cpp
  8. 14
      src/osd.cpp
  9. 2
      src/osd.h
  10. 4
      src/osd_ops.h
  11. 22
      src/osd_secondary.cpp

1
src/blockstore.h

@@ -27,6 +27,7 @@
#define DEFAULT_ORDER 17
#define MIN_BLOCK_SIZE 4*1024
#define MAX_BLOCK_SIZE 128*1024*1024
#define DEFAULT_BITMAP_GRANULARITY 4096
#define BS_OP_MIN 1
#define BS_OP_READ 1

2
src/blockstore_open.cpp

@@ -131,7 +131,7 @@ void blockstore_impl_t::parse_config(blockstore_config_t & config)
}
if (!bitmap_granularity)
{
bitmap_granularity = 4096;
bitmap_granularity = DEFAULT_BITMAP_GRANULARITY;
}
else if (bitmap_granularity % disk_alignment)
{

2
src/cluster_client.h

@@ -8,8 +8,6 @@
#define MIN_BLOCK_SIZE 4*1024
#define MAX_BLOCK_SIZE 128*1024*1024
#define DEFAULT_DISK_ALIGNMENT 4096
#define DEFAULT_BITMAP_GRANULARITY 4096
#define DEFAULT_CLIENT_MAX_DIRTY_BYTES 32*1024*1024
#define DEFAULT_CLIENT_MAX_DIRTY_OPS 1024

1
src/messenger.h

@@ -31,6 +31,7 @@
#define DEFAULT_PEER_CONNECT_INTERVAL 5
#define DEFAULT_PEER_CONNECT_TIMEOUT 5
#define DEFAULT_OSD_PING_TIMEOUT 5
#define DEFAULT_BITMAP_GRANULARITY 4096
struct osd_client_t
{

1
src/msgr_op.h

@@ -161,6 +161,7 @@ struct osd_op_t
osd_any_reply_t reply;
blockstore_op_t *bs_op = NULL;
void *buf = NULL;
void *bitmap = NULL;
void *rmw_buf = NULL;
osd_primary_op_data_t* op_data = NULL;
std::function<void(osd_op_t*)> callback;

24
src/msgr_receive.cpp

@@ -202,22 +202,36 @@ void osd_messenger_t::handle_op_hdr(osd_client_t *cl)
osd_op_t *cur_op = cl->read_op;
if (cur_op->req.hdr.opcode == OSD_OP_SEC_READ)
{
if (cur_op->req.sec_rw.len > 0)
cur_op->buf = memalign_or_die(MEM_ALIGNMENT, cur_op->req.sec_rw.len);
cl->read_remaining = 0;
}
else if (cur_op->req.hdr.opcode == OSD_OP_SEC_WRITE ||
cur_op->req.hdr.opcode == OSD_OP_SEC_WRITE_STABLE)
{
if (cur_op->req.sec_rw.attr_len > 0)
{
if (cur_op->req.sec_rw.attr_len > sizeof(void*))
{
cur_op->bitmap = cur_op->rmw_buf = malloc_or_die(cur_op->req.sec_rw.attr_len);
cl->recv_list.push_back(cur_op->bitmap, cur_op->req.sec_rw.attr_len);
}
else
cl->recv_list.push_back(&cur_op->bitmap, cur_op->req.sec_rw.attr_len);
}
if (cur_op->req.sec_rw.len > 0)
{
cur_op->buf = memalign_or_die(MEM_ALIGNMENT, cur_op->req.sec_rw.len);
cl->read_remaining = cur_op->req.sec_rw.len;
cl->recv_list.push_back(cur_op->buf, cur_op->req.sec_rw.len);
}
cl->read_remaining = cur_op->req.sec_rw.len + cur_op->req.sec_rw.attr_len;
}
else if (cur_op->req.hdr.opcode == OSD_OP_SEC_STABILIZE ||
cur_op->req.hdr.opcode == OSD_OP_SEC_ROLLBACK)
{
if (cur_op->req.sec_stab.len > 0)
{
cur_op->buf = memalign_or_die(MEM_ALIGNMENT, cur_op->req.sec_stab.len);
cl->recv_list.push_back(cur_op->buf, cur_op->req.sec_stab.len);
}
cl->read_remaining = cur_op->req.sec_stab.len;
}
else if (cur_op->req.hdr.opcode == OSD_OP_READ)
@@ -227,13 +241,15 @@ void osd_messenger_t::handle_op_hdr(osd_client_t *cl)
else if (cur_op->req.hdr.opcode == OSD_OP_WRITE)
{
if (cur_op->req.rw.len > 0)
{
cur_op->buf = memalign_or_die(MEM_ALIGNMENT, cur_op->req.rw.len);
cl->recv_list.push_back(cur_op->buf, cur_op->req.rw.len);
}
cl->read_remaining = cur_op->req.rw.len;
}
if (cl->read_remaining > 0)
{
// Read data
cl->recv_list.push_back(cur_op->buf, cl->read_remaining);
cl->read_state = CL_READ_DATA;
}
else

12
src/msgr_send.cpp

@ -59,6 +59,18 @@ void osd_messenger_t::outbox_push(osd_op_t *cur_op)
cur_op->req.hdr.opcode == OSD_OP_SEC_STABILIZE ||
cur_op->req.hdr.opcode == OSD_OP_SEC_ROLLBACK)) && cur_op->iov.count > 0)
{
to_outbox.push_back(NULL);
// Bitmap
if (cur_op->req.hdr.opcode == OSD_OP_SEC_READ && cur_op->reply.sec_rw.attr_len > 0 ||
(cur_op->req.hdr.opcode == OSD_OP_SEC_WRITE || cur_op->req.hdr.opcode == OSD_OP_SEC_WRITE_STABLE) &&
cur_op->req.sec_rw.attr_len > 0)
{
to_send_list.push_back((iovec){
.iov_base = (cur_op->reply.sec_rw.attr_len > sizeof(void*) ? cur_op->bitmap : &cur_op->bitmap),
.iov_len = cur_op->reply.sec_rw.attr_len,
});
to_outbox.push_back(NULL);
}
for (int i = 0; i < cur_op->iov.count; i++)
{
assert(cur_op->iov.buf[i].iov_base);

14
src/osd.cpp

@@ -12,7 +12,16 @@
osd_t::osd_t(blockstore_config_t & config, ring_loop_t *ringloop)
{
config["entry_attr_size"] = "0";
bs_block_size = strtoull(config["block_size"].c_str(), NULL, 10);
bs_bitmap_granularity = strtoull(config["bitmap_granularity"].c_str(), NULL, 10);
if (!bs_block_size)
bs_block_size = DEFAULT_BLOCK_SIZE;
if (!bs_bitmap_granularity)
bs_bitmap_granularity = DEFAULT_BITMAP_GRANULARITY;
// Force external bitmap size
entry_attr_size = bs_block_size / bs_bitmap_granularity / 8;
config["entry_attr_size"] = entry_attr_size;
this->config = config;
this->ringloop = ringloop;
@@ -20,9 +29,6 @@ osd_t::osd_t(blockstore_config_t & config, ring_loop_t *ringloop)
// FIXME: Create Blockstore from on-disk superblock config and check it against the OSD cluster config
this->bs = new blockstore_t(config, ringloop);
this->bs_block_size = bs->get_block_size();
this->bs_bitmap_granularity = bs->get_bitmap_granularity();
parse_config(config);
epmgr = new epoll_manager_t(ringloop);

2
src/osd.h

@@ -115,7 +115,7 @@ class osd_t
bool stopping = false;
int inflight_ops = 0;
blockstore_t *bs;
uint32_t bs_block_size, bs_bitmap_granularity;
uint32_t bs_block_size, bs_bitmap_granularity, entry_attr_size;
ring_loop_t *ringloop;
timerfd_manager_t *tfd = NULL;
epoll_manager_t *epmgr = NULL;

4
src/osd_ops.h

@@ -71,6 +71,8 @@ struct __attribute__((__packed__)) osd_op_secondary_rw_t
uint32_t offset;
// length
uint32_t len;
// bitmap/attribute length - bitmap comes after header, but before data
uint32_t attr_len;
};
struct __attribute__((__packed__)) osd_reply_secondary_rw_t
@@ -78,6 +80,8 @@ struct __attribute__((__packed__)) osd_reply_secondary_rw_t
osd_reply_header_t header;
// for reads and writes: assigned or read version number
uint64_t version;
// for reads: bitmap/attribute length (just to double-check)
uint32_t attr_len;
};
// delete object on the secondary OSD

22
src/osd_secondary.cpp

@@ -17,10 +17,17 @@ void osd_t::secondary_op_callback(osd_op_t *op)
{
op->reply.sec_del.version = op->bs_op->version;
}
if (op->req.hdr.opcode == OSD_OP_SEC_READ &&
op->bs_op->retval > 0)
if (op->req.hdr.opcode == OSD_OP_SEC_READ)
{
op->iov.push_back(op->buf, op->bs_op->retval);
if (entry_attr_size > 0)
{
op->reply.sec_rw.attr_len = entry_attr_size;
op->iov.push_back((entry_attr_size > sizeof(void*) ? op->bitmap : &op->bs_op->bitmap), entry_attr_size);
}
if (op->bs_op->retval > 0)
{
op->iov.push_back(op->buf, op->bs_op->retval);
}
}
else if (op->req.hdr.opcode == OSD_OP_SEC_LIST)
{
@@ -55,11 +62,20 @@ void osd_t::exec_secondary(osd_op_t *cur_op)
cur_op->req.hdr.opcode == OSD_OP_SEC_WRITE ||
cur_op->req.hdr.opcode == OSD_OP_SEC_WRITE_STABLE)
{
if (cur_op->req.hdr.opcode == OSD_OP_SEC_READ)
{
// Allocate memory for the read operation
if (entry_attr_size > sizeof(void*))
cur_op->bitmap = cur_op->rmw_buf = malloc_or_die(entry_attr_size);
if (cur_op->req.sec_rw.len > 0)
cur_op->buf = memalign_or_die(MEM_ALIGNMENT, cur_op->req.sec_rw.len);
}
cur_op->bs_op->oid = cur_op->req.sec_rw.oid;
cur_op->bs_op->version = cur_op->req.sec_rw.version;
cur_op->bs_op->offset = cur_op->req.sec_rw.offset;
cur_op->bs_op->len = cur_op->req.sec_rw.len;
cur_op->bs_op->buf = cur_op->buf;
cur_op->bs_op->bitmap = cur_op->bitmap;
#ifdef OSD_STUB
cur_op->bs_op->retval = cur_op->bs_op->len;
#endif

Loading…
Cancel
Save