Use clean_entry_bitmap_size instead of entry_attr_size back because of changed bitmap handling

rdma-zerocopy
Vitaliy Filippov 2021-02-07 16:26:08 +03:00
parent d0c2e31312
commit ab39ce2bbb
14 changed files with 54 additions and 64 deletions

View File

@ -65,9 +65,8 @@ Input:
- offset, len = offset and length within object. length may be zero, in that case
read operation only returns the version / write operation only bumps the version
- buf = pre-allocated buffer for data (read) / with data (write). may be NULL if len == 0.
- bitmap = pointer to <entry_attr_size> bytes long (usually very short) arbitrary data
stored for each object in the metadata area.
Called "bitmap" because it's used for the "external bitmap" in Vitastor.
- bitmap = pointer to the new 'external' object bitmap data. Its part which is respective to the
write request is copied into the metadata area bitwise and stored there.
Output:
- retval = number of bytes actually read/written or negative error number (-EINVAL or -ENOSPC)

View File

@ -428,7 +428,7 @@ resume_1:
{
new_clean_bitmap = (bs->inmemory_meta
? meta_new.buf + meta_new.pos*bs->clean_entry_size + sizeof(clean_disk_entry)
: bs->clean_bitmap + (clean_loc >> bs->block_order)*(bs->clean_entry_bitmap_size + bs->entry_attr_size));
: bs->clean_bitmap + (clean_loc >> bs->block_order)*(2*bs->clean_entry_bitmap_size));
if (clean_init_bitmap)
{
memset(new_clean_bitmap, 0, bs->clean_entry_bitmap_size);
@ -510,11 +510,11 @@ resume_1:
{
memcpy(&new_entry->bitmap, new_clean_bitmap, bs->clean_entry_bitmap_size);
}
if (bs->entry_attr_size)
{
// copy latest external bitmap/attributes
void *bmp_ptr = bs->entry_attr_size > sizeof(void*) ? dirty_end->second.bitmap : &dirty_end->second.bitmap;
memcpy((void*)(new_entry+1) + bs->clean_entry_bitmap_size, bmp_ptr, bs->entry_attr_size);
if (bs->clean_entry_bitmap_size)
{
void *bmp_ptr = bs->clean_entry_bitmap_size > sizeof(void*) ? dirty_end->second.bitmap : &dirty_end->second.bitmap;
memcpy((void*)(new_entry+1) + bs->clean_entry_bitmap_size, bmp_ptr, bs->clean_entry_bitmap_size);
}
}
await_sqe(6);

View File

@ -218,7 +218,7 @@ class blockstore_impl_t
uint32_t block_order;
uint64_t block_count;
uint32_t clean_entry_bitmap_size = 0, clean_entry_size = 0, entry_attr_size = 0;
uint32_t clean_entry_bitmap_size = 0, clean_entry_size = 0;
int meta_fd;
int data_fd;

View File

@ -98,9 +98,9 @@ void blockstore_init_meta::handle_entries(void* entries, unsigned count, int blo
for (unsigned i = 0; i < count; i++)
{
clean_disk_entry *entry = (clean_disk_entry*)(entries + i*bs->clean_entry_size);
if (!bs->inmemory_meta && (bs->clean_entry_bitmap_size || bs->entry_attr_size))
if (!bs->inmemory_meta && bs->clean_entry_bitmap_size)
{
memcpy(bs->clean_bitmap + (done_cnt+i)*(bs->clean_entry_bitmap_size + bs->entry_attr_size), &entry->bitmap, (bs->clean_entry_bitmap_size + bs->entry_attr_size));
memcpy(bs->clean_bitmap + (done_cnt+i)*2*bs->clean_entry_bitmap_size, &entry->bitmap, 2*bs->clean_entry_bitmap_size);
}
if (entry->oid.inode > 0)
{
@ -550,9 +550,9 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
.version = je->small_write.version,
};
void *bmp = (void*)je + sizeof(journal_entry_small_write);
if (bs->entry_attr_size <= sizeof(void*))
if (bs->clean_entry_bitmap_size <= sizeof(void*))
{
memcpy(&bmp, bmp, bs->entry_attr_size);
memcpy(&bmp, bmp, bs->clean_entry_bitmap_size);
}
else if (!bs->journal.inmemory)
{
@ -560,8 +560,8 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
// will result in a lot of small allocations for entry bitmaps. This can
// only be fixed by using a patched map with dynamic entry size, but not
// the btree_map, because it doesn't keep iterators valid all the time.
void *bmp_cp = malloc_or_die(bs->entry_attr_size);
memcpy(bmp_cp, bmp, bs->entry_attr_size);
void *bmp_cp = malloc_or_die(bs->clean_entry_bitmap_size);
memcpy(bmp_cp, bmp, bs->clean_entry_bitmap_size);
bmp = bmp_cp;
}
bs->dirty_db.emplace(ov, (dirty_entry){
@ -630,9 +630,9 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
.version = je->big_write.version,
};
void *bmp = (void*)je + sizeof(journal_entry_big_write);
if (bs->entry_attr_size <= sizeof(void*))
if (bs->clean_entry_bitmap_size <= sizeof(void*))
{
memcpy(&bmp, bmp, bs->entry_attr_size);
memcpy(&bmp, bmp, bs->clean_entry_bitmap_size);
}
else if (!bs->journal.inmemory)
{
@ -640,8 +640,8 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
// will result in a lot of small allocations for entry bitmaps. This can
// only be fixed by using a patched map with dynamic entry size, but not
// the btree_map, because it doesn't keep iterators valid all the time.
void *bmp_cp = malloc_or_die(bs->entry_attr_size);
memcpy(bmp_cp, bmp, bs->entry_attr_size);
void *bmp_cp = malloc_or_die(bs->clean_entry_bitmap_size);
memcpy(bmp_cp, bmp, bs->clean_entry_bitmap_size);
bmp = bmp_cp;
}
auto dirty_it = bs->dirty_db.emplace(ov, (dirty_entry){

View File

@ -62,7 +62,6 @@ void blockstore_impl_t::parse_config(blockstore_config_t & config)
cfg_data_size = strtoull(config["data_size"].c_str(), NULL, 10);
meta_device = config["meta_device"];
meta_offset = strtoull(config["meta_offset"].c_str(), NULL, 10);
entry_attr_size = strtoull(config["entry_attr_size"].c_str(), NULL, 10);
block_size = strtoull(config["block_size"].c_str(), NULL, 10);
inmemory_meta = config["inmemory_metadata"] != "false";
journal_device = config["journal_device"];
@ -153,11 +152,6 @@ void blockstore_impl_t::parse_config(blockstore_config_t & config)
{
throw std::runtime_error("meta_offset must be a multiple of meta_block_size = "+std::to_string(meta_block_size));
}
// FIXME: Due to the recent changes in entry_attr handling rename it back to bitmap
if (entry_attr_size > meta_block_size/2)
{
throw std::runtime_error("entry_attr_size is too big");
}
if (journal.offset % journal_block_size)
{
throw std::runtime_error("journal_offset must be a multiple of journal_block_size = "+std::to_string(journal_block_size));
@ -188,7 +182,7 @@ void blockstore_impl_t::parse_config(blockstore_config_t & config)
}
// init some fields
clean_entry_bitmap_size = block_size / bitmap_granularity / 8;
clean_entry_size = sizeof(clean_disk_entry) + clean_entry_bitmap_size + entry_attr_size;
clean_entry_size = sizeof(clean_disk_entry) + 2*clean_entry_bitmap_size;
journal.block_size = journal_block_size;
journal.next_free = journal_block_size;
journal.used_start = journal_block_size;
@ -253,9 +247,9 @@ void blockstore_impl_t::calc_lengths()
if (!metadata_buffer)
throw std::runtime_error("Failed to allocate memory for the metadata");
}
else if (clean_entry_bitmap_size || entry_attr_size)
else if (clean_entry_bitmap_size)
{
clean_bitmap = (uint8_t*)malloc(block_count * (clean_entry_bitmap_size + entry_attr_size));
clean_bitmap = (uint8_t*)malloc(block_count * 2*clean_entry_bitmap_size);
if (!clean_bitmap)
throw std::runtime_error("Failed to allocate memory for the metadata sparse write bitmap");
}

View File

@ -105,7 +105,7 @@ uint8_t* blockstore_impl_t::get_clean_entry_bitmap(uint64_t block_loc, int offse
clean_entry_bitmap = (uint8_t*)(metadata_buffer + sector + pos*clean_entry_size + sizeof(clean_disk_entry) + offset);
}
else
clean_entry_bitmap = (uint8_t*)(clean_bitmap + meta_loc*(clean_entry_bitmap_size + entry_attr_size) + offset);
clean_entry_bitmap = (uint8_t*)(clean_bitmap + meta_loc*2*clean_entry_bitmap_size + offset);
return clean_entry_bitmap;
}
@ -151,8 +151,8 @@ int blockstore_impl_t::dequeue_read(blockstore_op_t *read_op)
result_version = dirty_it->first.version;
if (read_op->bitmap)
{
void *bmp_ptr = (entry_attr_size > sizeof(void*) ? dirty_it->second.bitmap : &dirty_it->second.bitmap);
memcpy(read_op->bitmap, bmp_ptr, entry_attr_size);
void *bmp_ptr = (clean_entry_bitmap_size > sizeof(void*) ? dirty_it->second.bitmap : &dirty_it->second.bitmap);
memcpy(read_op->bitmap, bmp_ptr, clean_entry_bitmap_size);
}
}
if (!fulfill_read(read_op, fulfilled, dirty.offset, dirty.offset + dirty.len,
@ -178,7 +178,7 @@ int blockstore_impl_t::dequeue_read(blockstore_op_t *read_op)
if (read_op->bitmap)
{
void *bmp_ptr = get_clean_entry_bitmap(clean_it->second.location, clean_entry_bitmap_size);
memcpy(read_op->bitmap, bmp_ptr, entry_attr_size);
memcpy(read_op->bitmap, bmp_ptr, clean_entry_bitmap_size);
}
}
if (fulfilled < read_op->len)

View File

@ -268,7 +268,7 @@ void blockstore_impl_t::erase_dirty(blockstore_dirty_db_t::iterator dirty_start,
{
journal.used_sectors.erase(dirty_it->second.journal_sector);
}
if (entry_attr_size > sizeof(void*))
if (clean_entry_bitmap_size > sizeof(void*))
{
free(dirty_it->second.bitmap);
dirty_it->second.bitmap = NULL;

View File

@ -10,9 +10,9 @@ bool blockstore_impl_t::enqueue_write(blockstore_op_t *op)
bool wait_big = false, wait_del = false;
void *bmp = NULL;
uint64_t version = 1;
if (!is_del && entry_attr_size > sizeof(void*))
if (!is_del && clean_entry_bitmap_size > sizeof(void*))
{
bmp = calloc_or_die(1, entry_attr_size);
bmp = calloc_or_die(1, clean_entry_bitmap_size);
}
if (dirty_db.size() > 0)
{
@ -30,8 +30,8 @@ bool blockstore_impl_t::enqueue_write(blockstore_op_t *op)
wait_big = (dirty_it->second.state & BS_ST_TYPE_MASK) == BS_ST_BIG_WRITE
? !IS_SYNCED(dirty_it->second.state)
: ((dirty_it->second.state & BS_ST_WORKFLOW_MASK) == BS_ST_WAIT_BIG);
if (entry_attr_size > sizeof(void*))
memcpy(bmp, dirty_it->second.bitmap, entry_attr_size);
if (clean_entry_bitmap_size > sizeof(void*))
memcpy(bmp, dirty_it->second.bitmap, clean_entry_bitmap_size);
else
bmp = dirty_it->second.bitmap;
}
@ -43,7 +43,7 @@ bool blockstore_impl_t::enqueue_write(blockstore_op_t *op)
{
version = clean_it->second.version + 1;
void *bmp_ptr = get_clean_entry_bitmap(clean_it->second.location, clean_entry_bitmap_size);
memcpy((entry_attr_size > sizeof(void*) ? bmp : &bmp), bmp_ptr, entry_attr_size);
memcpy((clean_entry_bitmap_size > sizeof(void*) ? bmp : &bmp), bmp_ptr, clean_entry_bitmap_size);
}
else
{
@ -83,7 +83,7 @@ bool blockstore_impl_t::enqueue_write(blockstore_op_t *op)
{
// Invalid version requested
op->retval = -EEXIST;
if (!is_del && entry_attr_size > sizeof(void*))
if (!is_del && clean_entry_bitmap_size > sizeof(void*))
{
free(bmp);
}
@ -127,7 +127,7 @@ bool blockstore_impl_t::enqueue_write(blockstore_op_t *op)
if (op->bitmap)
{
// Only allow to overwrite part of the object bitmap respective to the write's offset/len
uint8_t *bmp_ptr = (uint8_t*)(entry_attr_size > sizeof(void*) ? bmp : &bmp);
uint8_t *bmp_ptr = (uint8_t*)(clean_entry_bitmap_size > sizeof(void*) ? bmp : &bmp);
uint32_t bit = op->offset/bitmap_granularity;
uint32_t bits_left = op->len/bitmap_granularity;
while (!(bit % 8) && bits_left > 8)
@ -166,7 +166,7 @@ void blockstore_impl_t::cancel_all_writes(blockstore_op_t *op, blockstore_dirty_
{
while (dirty_it != dirty_db.end() && dirty_it->first.oid == op->oid)
{
if (entry_attr_size > sizeof(void*))
if (clean_entry_bitmap_size > sizeof(void*))
free(dirty_it->second.bitmap);
dirty_db.erase(dirty_it++);
}
@ -345,7 +345,7 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
// Then pre-fill journal entry
journal_entry_small_write *je = (journal_entry_small_write*)prefill_single_journal_entry(
journal, op->opcode == BS_OP_WRITE_STABLE ? JE_SMALL_WRITE_INSTANT : JE_SMALL_WRITE,
sizeof(journal_entry_small_write) + entry_attr_size
sizeof(journal_entry_small_write) + clean_entry_bitmap_size
);
dirty_it->second.journal_sector = journal.sector_info[journal.cur_sector].offset;
journal.used_sectors[journal.sector_info[journal.cur_sector].offset]++;
@ -364,7 +364,7 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
je->len = op->len;
je->data_offset = journal.next_free;
je->crc32_data = crc32c(0, op->buf, op->len);
memcpy((void*)(je+1), (entry_attr_size > sizeof(void*) ? dirty_it->second.bitmap : &dirty_it->second.bitmap), entry_attr_size);
memcpy((void*)(je+1), (clean_entry_bitmap_size > sizeof(void*) ? dirty_it->second.bitmap : &dirty_it->second.bitmap), clean_entry_bitmap_size);
je->crc32 = je_crc32((journal_entry*)je);
journal.crc32_last = je->crc32;
if (immediate_commit != IMMEDIATE_NONE)
@ -437,7 +437,7 @@ resume_2:
BS_SUBMIT_GET_SQE_DECL(sqe);
je = (journal_entry_big_write*)prefill_single_journal_entry(
journal, op->opcode == BS_OP_WRITE_STABLE ? JE_BIG_WRITE_INSTANT : JE_BIG_WRITE,
sizeof(journal_entry_big_write) + entry_attr_size
sizeof(journal_entry_big_write) + clean_entry_bitmap_size
);
dirty_it->second.journal_sector = journal.sector_info[journal.cur_sector].offset;
journal.used_sectors[journal.sector_info[journal.cur_sector].offset]++;
@ -453,7 +453,7 @@ resume_2:
je->offset = op->offset;
je->len = op->len;
je->location = dirty_it->second.location;
memcpy((void*)(je+1), (entry_attr_size > sizeof(void*) ? dirty_it->second.bitmap : &dirty_it->second.bitmap), entry_attr_size);
memcpy((void*)(je+1), (clean_entry_bitmap_size > sizeof(void*) ? dirty_it->second.bitmap : &dirty_it->second.bitmap), clean_entry_bitmap_size);
je->crc32 = je_crc32((journal_entry*)je);
journal.crc32_last = je->crc32;
prepare_journal_sector_write(journal, journal.cur_sector, sqe,

View File

@ -18,10 +18,7 @@ osd_t::osd_t(blockstore_config_t & config, ring_loop_t *ringloop)
bs_block_size = DEFAULT_BLOCK_SIZE;
if (!bs_bitmap_granularity)
bs_bitmap_granularity = DEFAULT_BITMAP_GRANULARITY;
// Force external bitmap size
entry_attr_size = bs_block_size / bs_bitmap_granularity / 8;
config["entry_attr_size"] = std::to_string(entry_attr_size);
clean_entry_bitmap_size = bs_block_size / bs_bitmap_granularity / 8;
this->config = config;
this->ringloop = ringloop;

View File

@ -126,7 +126,7 @@ class osd_t
bool stopping = false;
int inflight_ops = 0;
blockstore_t *bs;
uint32_t bs_block_size, bs_bitmap_granularity, entry_attr_size;
uint32_t bs_block_size, bs_bitmap_granularity, clean_entry_bitmap_size;
ring_loop_t *ringloop;
timerfd_manager_t *tfd = NULL;
epoll_manager_t *epmgr = NULL;

View File

@ -53,7 +53,7 @@ bool osd_t::prepare_primary_rw(osd_op_t *cur_op)
}
int stripe_count = (pool_cfg.scheme == POOL_SCHEME_REPLICATED ? 1 : pg_it->second.pg_size);
osd_primary_op_data_t *op_data = (osd_primary_op_data_t*)calloc_or_die(
1, sizeof(osd_primary_op_data_t) + (entry_attr_size + sizeof(osd_rmw_stripe_t)) * stripe_count
1, sizeof(osd_primary_op_data_t) + (clean_entry_bitmap_size + sizeof(osd_rmw_stripe_t)) * stripe_count
);
op_data->pg_num = pg_num;
op_data->oid = oid;
@ -65,7 +65,7 @@ bool osd_t::prepare_primary_rw(osd_op_t *cur_op)
// Allocate bitmaps along with stripes to avoid extra allocations and fragmentation
for (int i = 0; i < stripe_count; i++)
{
op_data->stripes[i].bmp_buf = (void*)(op_data->stripes+stripe_count) + entry_attr_size*i;
op_data->stripes[i].bmp_buf = (void*)(op_data->stripes+stripe_count) + clean_entry_bitmap_size*i;
}
pg_it->second.inflight++;
return true;
@ -154,18 +154,18 @@ resume_2:
finish_op(cur_op, op_data->epipe > 0 ? -EPIPE : -EIO);
return;
}
cur_op->reply.rw.bitmap_len = op_data->pg_data_size * entry_attr_size;
cur_op->reply.rw.bitmap_len = op_data->pg_data_size * clean_entry_bitmap_size;
if (op_data->degraded)
{
// Reconstruct missing stripes
osd_rmw_stripe_t *stripes = op_data->stripes;
if (op_data->scheme == POOL_SCHEME_XOR)
{
reconstruct_stripes_xor(stripes, op_data->pg_size, entry_attr_size);
reconstruct_stripes_xor(stripes, op_data->pg_size, clean_entry_bitmap_size);
}
else if (op_data->scheme == POOL_SCHEME_JERASURE)
{
reconstruct_stripes_jerasure(stripes, op_data->pg_size, op_data->pg_data_size, entry_attr_size);
reconstruct_stripes_jerasure(stripes, op_data->pg_size, op_data->pg_data_size, clean_entry_bitmap_size);
}
cur_op->iov.push_back(op_data->stripes[0].bmp_buf, cur_op->reply.rw.bitmap_len);
for (int role = 0; role < op_data->pg_size; role++)

View File

@ -155,7 +155,7 @@ void osd_t::submit_primary_subops(int submit_type, uint64_t op_version, int pg_s
clock_gettime(CLOCK_REALTIME, &subops[i].tv_begin);
subops[i].op_type = (uint64_t)cur_op;
subops[i].bitmap = stripes[stripe_num].bmp_buf;
subops[i].bitmap_len = entry_attr_size;
subops[i].bitmap_len = clean_entry_bitmap_size;
subops[i].bs_op = new blockstore_op_t({
.opcode = (uint64_t)(wr ? (rep ? BS_OP_WRITE_STABLE : BS_OP_WRITE) : BS_OP_READ),
.callback = [subop = &subops[i], this](blockstore_op_t *bs_subop)
@ -186,7 +186,7 @@ void osd_t::submit_primary_subops(int submit_type, uint64_t op_version, int pg_s
subops[i].op_type = OSD_OP_OUT;
subops[i].peer_fd = c_cli.osd_peer_fds.at(role_osd_num);
subops[i].bitmap = stripes[stripe_num].bmp_buf;
subops[i].bitmap_len = entry_attr_size;
subops[i].bitmap_len = clean_entry_bitmap_size;
subops[i].req.sec_rw = {
.header = {
.magic = SECONDARY_OSD_OP_MAGIC,
@ -200,7 +200,7 @@ void osd_t::submit_primary_subops(int submit_type, uint64_t op_version, int pg_s
.version = op_version,
.offset = wr ? stripes[stripe_num].write_start : stripes[stripe_num].read_start,
.len = wr ? stripes[stripe_num].write_end - stripes[stripe_num].write_start : stripes[stripe_num].read_end - stripes[stripe_num].read_start,
.attr_len = wr ? entry_attr_size : 0,
.attr_len = wr ? clean_entry_bitmap_size : 0,
};
#ifdef OSD_DEBUG
printf(

View File

@ -78,7 +78,7 @@ resume_1:
else
{
cur_op->rmw_buf = calc_rmw(cur_op->buf, op_data->stripes, op_data->prev_set,
pg.pg_size, op_data->pg_data_size, pg.pg_cursize, pg.cur_set.data(), bs_block_size, entry_attr_size);
pg.pg_size, op_data->pg_data_size, pg.pg_cursize, pg.cur_set.data(), bs_block_size, clean_entry_bitmap_size);
if (!cur_op->rmw_buf)
{
// Refuse partial overwrite of an incomplete object
@ -123,11 +123,11 @@ resume_3:
// Recover missing stripes, calculate parity
if (pg.scheme == POOL_SCHEME_XOR)
{
calc_rmw_parity_xor(op_data->stripes, pg.pg_size, op_data->prev_set, pg.cur_set.data(), bs_block_size, entry_attr_size);
calc_rmw_parity_xor(op_data->stripes, pg.pg_size, op_data->prev_set, pg.cur_set.data(), bs_block_size, clean_entry_bitmap_size);
}
else if (pg.scheme == POOL_SCHEME_JERASURE)
{
calc_rmw_parity_jerasure(op_data->stripes, pg.pg_size, op_data->pg_data_size, op_data->prev_set, pg.cur_set.data(), bs_block_size, entry_attr_size);
calc_rmw_parity_jerasure(op_data->stripes, pg.pg_size, op_data->pg_data_size, op_data->prev_set, pg.cur_set.data(), bs_block_size, clean_entry_bitmap_size);
}
}
// Send writes

View File

@ -20,7 +20,7 @@ void osd_t::secondary_op_callback(osd_op_t *op)
if (op->req.hdr.opcode == OSD_OP_SEC_READ)
{
if (op->bs_op->retval >= 0)
op->reply.sec_rw.attr_len = entry_attr_size;
op->reply.sec_rw.attr_len = clean_entry_bitmap_size;
else
op->reply.sec_rw.attr_len = 0;
if (op->bs_op->retval > 0)
@ -62,8 +62,8 @@ void osd_t::exec_secondary(osd_op_t *cur_op)
if (cur_op->req.hdr.opcode == OSD_OP_SEC_READ)
{
// Allocate memory for the read operation
if (entry_attr_size > sizeof(unsigned))
cur_op->bitmap = cur_op->rmw_buf = malloc_or_die(entry_attr_size);
if (clean_entry_bitmap_size > sizeof(unsigned))
cur_op->bitmap = cur_op->rmw_buf = malloc_or_die(clean_entry_bitmap_size);
else
cur_op->bitmap = &cur_op->bmp_data;
if (cur_op->req.sec_rw.len > 0)