Actual snapshot support (untested)

rdma-zerocopy
Vitaliy Filippov 2021-01-30 01:35:58 +03:00
부모 ffe1cd4c79
커밋 691f066055
7개의 변경된 파일274개의 추가작업 그리고 38개의 파일을 삭제

파일 보기

@ -24,6 +24,7 @@ const etcd_allow = new RegExp('^'+[
'config/pools',
'config/osd/[1-9]\\d*',
'config/pgs',
'config/inode/[1-9]\\d*/[1-9]\\d*',
'osd/state/[1-9]\\d*',
'osd/stats/[1-9]\\d*',
'osd/inodestats/[1-9]\\d*',
@ -144,6 +145,17 @@ const etcd_tree = {
}
}, */
pgs: {},
/* inode: {
<pool_id>: {
<inode_t>: {
name: string,
parent_pool?: <pool_id>,
parent_id?: <inode_t>,
readonly?: boolean,
}
}
}, */
inode: {},
},
osd: {
state: {

파일 보기

@ -5,6 +5,7 @@
#include <assert.h>
#include "cluster_client.h"
#define SCRAP_BUFFER_SIZE 4*1024*1024
#define PART_SENT 1
#define PART_DONE 2
#define PART_ERROR 4
@ -63,10 +64,8 @@ cluster_client_t::cluster_client_t(ring_loop_t *ringloop, timerfd_manager_t *tfd
st_cli.parse_config(config);
st_cli.load_global_config();
// Temporary implementation: discard all bitmaps
// It will be of course replaced by the implementation of snapshots
scrap_bitmap_size = 4096;
scrap_bitmap = malloc_or_die(scrap_bitmap_size);
scrap_buffer_size = SCRAP_BUFFER_SIZE;
scrap_buffer = malloc_or_die(scrap_buffer_size);
if (ringloop)
{
@ -91,7 +90,22 @@ cluster_client_t::~cluster_client_t()
{
ringloop->unregister_consumer(&consumer);
}
free(scrap_bitmap);
free(scrap_buffer);
}
cluster_op_t::~cluster_op_t()
{
if (buf)
{
free(buf);
buf = NULL;
}
if (bitmap_buf)
{
free(bitmap_buf);
part_bitmaps = NULL;
bitmap_buf = NULL;
}
}
void cluster_client_t::continue_ops(bool up_retry)
@ -193,6 +207,7 @@ void cluster_client_t::on_load_config_hook(json11::Json::object & config)
{
bs_bitmap_granularity = DEFAULT_BITMAP_GRANULARITY;
}
bs_bitmap_size = bs_block_size / bs_bitmap_granularity / 8;
uint32_t block_order;
if ((block_order = is_power_of_two(bs_block_size)) >= 64 || bs_block_size < MIN_BLOCK_SIZE || bs_block_size >= MAX_BLOCK_SIZE)
{
@ -269,7 +284,7 @@ void cluster_client_t::on_change_hook(json11::Json::object & changes)
for (auto op: op_queue)
{
if ((op->opcode == OSD_OP_WRITE || op->opcode == OSD_OP_READ) &&
INODE_POOL(op->inode) == pool_item.first)
INODE_POOL(op->cur_inode) == pool_item.first)
{
op->needs_reslice = true;
}
@ -334,6 +349,7 @@ void cluster_client_t::execute(cluster_op_t *op)
std::function<void(cluster_op_t*)>(op->callback)(op);
return;
}
op->cur_inode = op->inode;
op->retval = 0;
if (op->opcode == OSD_OP_WRITE && !immediate_commit)
{
@ -446,7 +462,7 @@ void cluster_client_t::flush_buffer(const object_id & oid, cluster_buffer_t *wr)
cluster_op_t *op = new cluster_op_t;
op->flags = OP_FLUSH_BUFFER;
op->opcode = OSD_OP_WRITE;
op->inode = oid.inode;
op->cur_inode = op->inode = oid.inode;
op->offset = oid.stripe;
op->len = wr->len;
op->iov.push_back(wr->buf, wr->len);
@ -484,7 +500,7 @@ resume_0:
return 1;
}
{
pool_id_t pool_id = INODE_POOL(op->inode);
pool_id_t pool_id = INODE_POOL(op->cur_inode);
if (!pool_id)
{
op->retval = -EINVAL;
@ -500,6 +516,13 @@ resume_0:
}
if (op->opcode == OSD_OP_WRITE)
{
auto ino_it = st_cli.inode_config.find(op->inode);
if (ino_it != st_cli.inode_config.end() && ino_it->second.readonly)
{
op->retval = -EINVAL;
std::function<void(cluster_op_t*)>(op->callback)(op);
return 1;
}
if (!immediate_commit && !(op->flags & OP_FLUSH_BUFFER))
{
copy_write(op, dirty_buffers);
@ -558,6 +581,22 @@ resume_3:
// Finished successfully
// Even if the PG count has changed in meanwhile we treat it as success
// because if some operations were invalid for the new PG count we'd get errors
bool is_read = op->opcode == OSD_OP_READ;
if (is_read)
{
// Check parent inode
auto ino_it = st_cli.inode_config.find(op->cur_inode);
if (ino_it != st_cli.inode_config.end() &&
ino_it->second.parent_id)
{
// Continue reading from the parent inode
// FIXME: This obviously requires optimizations for long snapshot chains
op->cur_inode = ino_it->second.parent_id;
op->parts.clear();
op->done_count = 0;
goto resume_1;
}
}
op->retval = op->len;
std::function<void(cluster_op_t*)>(op->callback)(op);
return 1;
@ -590,51 +629,130 @@ resume_3:
return 0;
}
static void add_iov(int size, bool skip, cluster_op_t *op, int &iov_idx, size_t &iov_pos, osd_op_buf_list_t &iov, void *scrap, int scrap_len)
{
int left = size;
while (left > 0 && iov_idx < op->iov.count)
{
int cur_left = op->iov.buf[iov_idx].iov_len - iov_pos;
if (cur_left < left)
{
if (!skip)
{
iov.push_back(op->iov.buf[iov_idx].iov_base + iov_pos, cur_left);
}
left -= cur_left;
iov_pos = 0;
iov_idx++;
}
else
{
if (!skip)
{
iov.push_back(op->iov.buf[iov_idx].iov_base + iov_pos, left);
}
iov_pos += left;
left = 0;
}
}
assert(left == 0);
if (skip && scrap_len > 0)
{
// All skipped ranges are read into the same useless buffer
left = size;
while (left > 0)
{
int cur_left = scrap_len < left ? scrap_len : left;
iov.push_back(scrap, cur_left);
left -= cur_left;
}
}
}
void cluster_client_t::slice_rw(cluster_op_t *op)
{
// Slice the request into individual object stripe requests
// Primary OSDs still operate individual stripes, but their size is multiplied by PG minsize in case of EC
auto & pool_cfg = st_cli.pool_config.at(INODE_POOL(op->inode));
auto & pool_cfg = st_cli.pool_config.at(INODE_POOL(op->cur_inode));
uint32_t pg_data_size = (pool_cfg.scheme == POOL_SCHEME_REPLICATED ? 1 : pool_cfg.pg_size-pool_cfg.parity_chunks);
uint64_t pg_block_size = bs_block_size * pg_data_size;
uint64_t first_stripe = (op->offset / pg_block_size) * pg_block_size;
uint64_t last_stripe = ((op->offset + op->len + pg_block_size - 1) / pg_block_size - 1) * pg_block_size;
op->retval = 0;
op->parts.resize((last_stripe - first_stripe) / pg_block_size + 1);
if (op->opcode == OSD_OP_READ)
{
// Allocate memory for the bitmap
unsigned object_bitmap_size = ((op->len / bs_bitmap_granularity + 7) / 8);
object_bitmap_size = (object_bitmap_size < 8 ? 8 : object_bitmap_size);
unsigned bitmap_mem = object_bitmap_size + (bs_bitmap_size * pg_data_size) * op->parts.size();
if (op->bitmap_buf_size < bitmap_mem)
{
op->bitmap_buf = realloc_or_die(op->bitmap_buf, bitmap_mem);
if (!op->bitmap_buf_size)
{
// First allocation
memset(op->bitmap_buf, 0, object_bitmap_size);
}
op->part_bitmaps = op->bitmap_buf + object_bitmap_size;
op->bitmap_buf_size = bitmap_mem;
}
}
int iov_idx = 0;
size_t iov_pos = 0;
int i = 0;
for (uint64_t stripe = first_stripe; stripe <= last_stripe; stripe += pg_block_size)
{
pg_num_t pg_num = (op->inode + stripe/pool_cfg.pg_stripe_size) % pool_cfg.real_pg_count + 1;
pg_num_t pg_num = (op->cur_inode + stripe/pool_cfg.pg_stripe_size) % pool_cfg.real_pg_count + 1; // like map_to_pg()
uint64_t begin = (op->offset < stripe ? stripe : op->offset);
uint64_t end = (op->offset + op->len) > (stripe + pg_block_size)
? (stripe + pg_block_size) : (op->offset + op->len);
op->parts[i] = (cluster_op_part_t){
.parent = op,
.offset = begin,
.len = (uint32_t)(end - begin),
.pg_num = pg_num,
.flags = 0,
};
int left = end-begin;
while (left > 0 && iov_idx < op->iov.count)
op->parts[i].iov.reset();
if (op->cur_inode != op->inode)
{
if (op->iov.buf[iov_idx].iov_len - iov_pos < left)
// Read remaining parts from upper layers
uint64_t prev = begin, cur = begin;
bool skip_prev = true;
while (cur < end)
{
op->parts[i].iov.push_back(op->iov.buf[iov_idx].iov_base + iov_pos, op->iov.buf[iov_idx].iov_len - iov_pos);
left -= (op->iov.buf[iov_idx].iov_len - iov_pos);
iov_pos = 0;
iov_idx++;
unsigned bmp_loc = (cur - op->offset)/bs_bitmap_granularity;
bool skip = (((*(uint8_t*)(op->bitmap_buf + bmp_loc/8)) >> (bmp_loc%8)) & 0x1);
if (skip_prev != skip)
{
if (cur > prev)
{
if (prev == begin && skip_prev)
{
begin = cur;
// Just advance iov_idx & iov_pos
add_iov(cur-prev, true, op, iov_idx, iov_pos, op->parts[i].iov, NULL, 0);
}
else
add_iov(cur-prev, skip_prev, op, iov_idx, iov_pos, op->parts[i].iov, scrap_buffer, scrap_buffer_size);
}
skip_prev = skip;
prev = cur;
}
cur += bs_bitmap_granularity;
}
assert(cur > prev);
if (skip_prev)
end = prev;
else
{
op->parts[i].iov.push_back(op->iov.buf[iov_idx].iov_base + iov_pos, left);
iov_pos += left;
left = 0;
}
add_iov(cur-prev, skip_prev, op, iov_idx, iov_pos, op->parts[i].iov, scrap_buffer, scrap_buffer_size);
if (end == begin)
op->done_count++;
}
assert(left == 0);
else
{
add_iov(end-begin, false, op, iov_idx, iov_pos, op->parts[i].iov, NULL, 0);
}
op->parts[i].parent = op;
op->parts[i].offset = begin;
op->parts[i].len = (uint32_t)(end - begin);
op->parts[i].pg_num = pg_num;
op->parts[i].osd_num = 0;
op->parts[i].flags = 0;
i++;
}
}
@ -661,7 +779,7 @@ bool cluster_client_t::affects_osd(uint64_t inode, uint64_t offset, uint64_t len
bool cluster_client_t::try_send(cluster_op_t *op, int i)
{
auto part = &op->parts[i];
auto & pool_cfg = st_cli.pool_config[INODE_POOL(op->inode)];
auto & pool_cfg = st_cli.pool_config.at(INODE_POOL(op->cur_inode));
auto pg_it = pool_cfg.pg_config.find(part->pg_num);
if (pg_it != pool_cfg.pg_config.end() &&
!pg_it->second.pause && pg_it->second.cur_primary)
@ -674,6 +792,9 @@ bool cluster_client_t::try_send(cluster_op_t *op, int i)
part->osd_num = primary_osd;
part->flags |= PART_SENT;
op->inflight_count++;
uint64_t pg_bitmap_size = bs_bitmap_size * (
pool_cfg.scheme == POOL_SCHEME_REPLICATED ? 1 : pool_cfg.pg_size-pool_cfg.parity_chunks
);
part->op = (osd_op_t){
.op_type = OSD_OP_OUT,
.peer_fd = peer_fd,
@ -683,12 +804,12 @@ bool cluster_client_t::try_send(cluster_op_t *op, int i)
.id = op_id++,
.opcode = op->opcode,
},
.inode = op->inode,
.inode = op->cur_inode,
.offset = part->offset,
.len = part->len,
} },
.bitmap = scrap_bitmap,
.bitmap_len = scrap_bitmap_size,
.bitmap = op->opcode == OSD_OP_WRITE ? NULL : op->part_bitmaps + pg_bitmap_size*i,
.bitmap_len = (unsigned)(op->opcode == OSD_OP_WRITE ? 0 : pg_bitmap_size),
.callback = [this, part](osd_op_t *op_part)
{
handle_op_part(part);
@ -817,6 +938,16 @@ void cluster_client_t::send_sync(cluster_op_t *op, cluster_op_part_t *part)
msgr.outbox_push(&part->op);
}
static inline void mem_or(void *res, const void *r2, unsigned int len)
{
unsigned int i;
for (i = 0; i < len; ++i)
{
// Hope the compiler vectorizes this
((uint8_t*)res)[i] = ((uint8_t*)res)[i] | ((uint8_t*)r2)[i];
}
}
void cluster_client_t::handle_op_part(cluster_op_part_t *part)
{
cluster_op_t *op = part->parent;
@ -856,9 +987,43 @@ void cluster_client_t::handle_op_part(cluster_op_part_t *part)
dirty_osds.insert(part->osd_num);
part->flags |= PART_DONE;
op->done_count++;
if (op->opcode == OSD_OP_READ)
{
copy_part_bitmap(op, part);
}
}
if (op->inflight_count == 0)
{
continue_ops();
}
}
void cluster_client_t::copy_part_bitmap(cluster_op_t *op, cluster_op_part_t *part)
{
// Copy (OR) bitmap
auto & pool_cfg = st_cli.pool_config.at(INODE_POOL(op->cur_inode));
uint32_t pg_block_size = bs_block_size * (
pool_cfg.scheme == POOL_SCHEME_REPLICATED ? 1 : pool_cfg.pg_size-pool_cfg.parity_chunks
);
uint32_t object_offset = (part->op.req.rw.offset - op->offset) / bs_bitmap_granularity;
uint32_t part_offset = (part->op.req.rw.offset % pg_block_size) / bs_bitmap_granularity;
uint32_t part_len = part->op.req.rw.len / bs_bitmap_granularity;
if (!(object_offset & 0x7) && !(part_offset & 0x7) && (part_len >= 8))
{
// Copy bytes
mem_or(op->bitmap_buf + object_offset/8, part->op.bitmap + part_offset/8, part_len/8);
object_offset += (part_len & ~0x7);
part_offset += (part_len & ~0x7);
part_len = (part_len & 0x7);
}
while (part_len > 0)
{
// Copy bits
(*(uint8_t*)(op->bitmap_buf + (object_offset >> 3))) |= (
(((*(uint8_t*)(part->op.bitmap + (part_offset >> 3))) >> (part_offset & 0x7)) & 0x1) << (object_offset & 0x7)
);
part_offset++;
object_offset++;
part_len--;
}
}

파일 보기

@ -34,15 +34,19 @@ struct cluster_op_t
int retval;
osd_op_buf_list_t iov;
std::function<void(cluster_op_t*)> callback;
~cluster_op_t();
protected:
int flags = 0;
int state = 0;
uint64_t cur_inode; // for snapshot reads
void *buf = NULL;
cluster_op_t *orig_op = NULL;
bool needs_reslice = false;
bool up_wait = false;
int inflight_count = 0, done_count = 0;
std::vector<cluster_op_part_t> parts;
void *bitmap_buf = NULL, *part_bitmaps = NULL;
unsigned bitmap_buf_size = 0;
friend class cluster_client_t;
};
@ -60,7 +64,7 @@ class cluster_client_t
ring_loop_t *ringloop;
uint64_t bs_block_size = 0;
uint64_t bs_bitmap_granularity = 0;
uint32_t bs_bitmap_granularity = 0, bs_bitmap_size = 0;
std::map<pool_id_t, uint64_t> pg_counts;
bool immediate_commit = false;
// FIXME: Implement inmemory_commit mode. Note that it requires to return overlapping reads from memory.
@ -77,8 +81,8 @@ class cluster_client_t
std::set<osd_num_t> dirty_osds;
uint64_t dirty_bytes = 0, dirty_ops = 0;
void *scrap_bitmap = NULL;
unsigned scrap_bitmap_size = 0;
void *scrap_buffer = NULL;
unsigned scrap_buffer_size = 0;
bool pgs_loaded = false;
ring_consumer_t consumer;
@ -112,4 +116,5 @@ protected:
int continue_sync(cluster_op_t *op);
void send_sync(cluster_op_t *op, cluster_op_part_t *part);
void handle_op_part(cluster_op_part_t *part);
void copy_part_bitmap(cluster_op_t *op, cluster_op_part_t *part);
};

파일 보기

@ -642,4 +642,47 @@ void etcd_state_client_t::parse_state(const std::string & key, const json11::Jso
}
}
}
else if (key.substr(0, etcd_prefix.length()+14) == etcd_prefix+"/config/inode/")
{
// <etcd_prefix>/config/inode/%d/%d
uint64_t pool_id = 0;
uint64_t inode_num = 0;
char null_byte = 0;
sscanf(key.c_str() + etcd_prefix.length()+14, "%lu/%lu%c", &pool_id, &inode_num, &null_byte);
if (!pool_id || pool_id >= POOL_ID_MAX || !inode_num || (inode_num >> (64-POOL_ID_BITS)) || null_byte != 0)
{
printf("Bad etcd key %s, ignoring\n", key.c_str());
}
else
{
inode_num |= (pool_id << (64-POOL_ID_BITS));
if (!value.is_object())
{
this->inode_config.erase(inode_num);
}
else
{
inode_t parent_inode_num = value["parent_id"].uint64_value();
if (parent_inode_num && !(parent_inode_num >> (64-POOL_ID_BITS)))
{
uint64_t parent_pool_id = value["parent_pool"].uint64_value();
if (parent_pool_id >= POOL_ID_MAX)
{
printf(
"Inode %lu/%lu parent_pool value is invalid, ignoring parent setting\n",
inode_num >> (64-POOL_ID_BITS), inode_num & ((1l << (64-POOL_ID_BITS)) - 1)
);
parent_inode_num = 0;
}
else
parent_inode_num |= parent_pool_id << (64-POOL_ID_BITS);
}
this->inode_config[inode_num] = (inode_config_t){
.name = value["name"].string_value(),
.parent_id = parent_inode_num,
.readonly = value["readonly"].bool_value(),
};
}
}
}
}

파일 보기

@ -52,6 +52,13 @@ struct pool_config_t
std::map<pg_num_t, pg_config_t> pg_config;
};
struct inode_config_t
{
std::string name;
inode_t parent_id;
bool readonly;
};
struct websocket_t;
struct etcd_state_client_t
@ -70,6 +77,7 @@ public:
uint64_t etcd_watch_revision = 0;
std::map<pool_id_t, pool_config_t> pool_config;
std::map<osd_num_t, json11::Json> peer_states;
std::map<inode_t, inode_config_t> inode_config;
std::function<void(json11::Json::object &)> on_change_hook;
std::function<void(json11::Json::object &)> on_load_config_hook;

파일 보기

@ -161,6 +161,7 @@ struct osd_op_t
osd_any_reply_t reply;
blockstore_op_t *bs_op = NULL;
void *buf = NULL;
// bitmap, bitmap_len, bmp_data are only meaningful for reads
void *bitmap = NULL;
unsigned bitmap_len = 0;
unsigned bmp_data = 0;

파일 보기

@ -6,12 +6,14 @@
#include <stdint.h>
#include <functional>
typedef uint64_t inode_t;
// 16 bytes per object/stripe id
// stripe = (start of the parity stripe + peer role)
// i.e. for example (256KB + one of 0,1,2)
struct __attribute__((__packed__)) object_id
{
uint64_t inode;
inode_t inode;
uint64_t stripe;
};