
Add "external" bitmap support to blockstore

rdma-zerocopy
Vitaliy Filippov, 7 months ago
commit 6107a4d07b
10 changed files:

  1. src/blockstore.h (5)
  2. src/blockstore_flush.cpp (9)
  3. src/blockstore_impl.h (9)
  4. src/blockstore_init.cpp (36)
  5. src/blockstore_journal.h (6)
  6. src/blockstore_open.cpp (9)
  7. src/blockstore_read.cpp (37)
  8. src/blockstore_rollback.cpp (5)
  9. src/blockstore_write.cpp (18)
  10. src/osd_rmw.cpp (2)

src/blockstore.h (5)

@@ -64,6 +64,10 @@ Input:
- offset, len = offset and length within object. length may be zero, in that case
read operation only returns the version / write operation only bumps the version
- buf = pre-allocated buffer for data (read) / with data (write). may be NULL if len == 0.
- bitmap = <entry_attr_size> bytes of arbitrary data stored for each object in the metadata area.
when <entry_attr_size> fits into the size of a pointer, the data should be passed as this field's value.
when it doesn't fit, this field should be a pointer to that data.
named "bitmap" because it's used for the "external bitmap" in Vitastor.
Output:
- retval = number of bytes actually read/written or negative error number (-EINVAL or -ENOSPC)
@@ -141,6 +145,7 @@ struct blockstore_op_t
uint32_t offset;
uint32_t len;
void *buf;
void *bitmap;
int retval;
uint8_t private_data[BS_OP_PRIVATE_DATA_SIZE];
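
The new "bitmap" field uses a small-value convention on both the write and the read path: when <entry_attr_size> fits into a pointer, the bytes are stored in the field itself, otherwise the field points to them. A minimal caller-side sketch of that convention (illustration only, not part of this commit; assumes <cstring> and the blockstore_op_t fields shown above):

// Sketch: attach entry_attr_size bytes of external attributes to an op.
// 'attrs' points to entry_attr_size bytes prepared by the caller.
static void set_op_bitmap(blockstore_op_t *op, const void *attrs, uint32_t entry_attr_size)
{
    if (entry_attr_size <= sizeof(void*))
        memcpy(&op->bitmap, attrs, entry_attr_size);  // bytes live directly in the pointer-sized field
    else
        op->bitmap = (void*)attrs;                    // field holds a pointer to caller-owned data
}

On reads the blockstore applies the same convention in reverse: it either writes the bytes into the field itself or copies them into the buffer the field points to (see blockstore_read.cpp below).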

src/blockstore_flush.cpp (9)

@@ -428,7 +428,7 @@ resume_1:
{
new_clean_bitmap = (bs->inmemory_meta
? meta_new.buf + meta_new.pos*bs->clean_entry_size + sizeof(clean_disk_entry)
: bs->clean_bitmap + (clean_loc >> bs->block_order)*bs->clean_entry_bitmap_size);
: bs->clean_bitmap + (clean_loc >> bs->block_order)*(bs->clean_entry_bitmap_size + bs->entry_attr_size));
if (clean_init_bitmap)
{
memset(new_clean_bitmap, 0, bs->clean_entry_bitmap_size);
@@ -473,6 +473,7 @@ resume_1:
wait_state = 5;
return false;
}
// zero out old metadata entry
memset(meta_old.buf + meta_old.pos*bs->clean_entry_size, 0, bs->clean_entry_size);
await_sqe(15);
data->iov = (struct iovec){ meta_old.buf, bs->meta_block_size };
@@ -509,6 +510,12 @@ resume_1:
{
memcpy(&new_entry->bitmap, new_clean_bitmap, bs->clean_entry_bitmap_size);
}
if (bs->entry_attr_size)
{
// copy latest external bitmap/attributes
void *bmp_ptr = bs->entry_attr_size > sizeof(void*) ? dirty_end->second.bitmap : &dirty_end->second.bitmap;
memcpy((void*)(new_entry+1) + bs->clean_entry_bitmap_size, bmp_ptr, bs->entry_attr_size);
}
}
await_sqe(6);
data->iov = (struct iovec){ meta_new.buf, bs->meta_block_size };
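
The hunk above extracts the latest external attributes from the in-memory dirty entry with a ternary: the stored pointer when the attributes are larger than a pointer, or the address of the field itself when they fit inside it. The same expression reappears in blockstore_write.cpp; a tiny helper capturing the pattern, as a sketch only (the commit inlines the ternary instead):

// Sketch: resolve the bytes behind a dirty_entry's "bitmap" field.
static inline void *dirty_bitmap_ptr(dirty_entry & de, uint32_t entry_attr_size)
{
    // large attributes: the field points to a heap buffer of entry_attr_size bytes;
    // small attributes: the field itself holds the bytes
    return entry_attr_size > sizeof(void*) ? de.bitmap : (void*)&de.bitmap;
}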

src/blockstore_impl.h (9)

@@ -77,7 +77,8 @@
#include "blockstore_journal.h"
// 24 bytes + block bitmap per "clean" entry on disk with fixed metadata tables
// 32 bytes = 24 bytes + block bitmap (4 bytes by default) + external attributes (also bitmap, 4 bytes by default)
// per "clean" entry on disk with fixed metadata tables
// FIXME: maybe add crc32's to metadata
struct __attribute__((__packed__)) clean_disk_entry
{
@@ -93,7 +94,7 @@ struct __attribute__((__packed__)) clean_entry
uint64_t location;
};
// 56 = 24 + 32 bytes per dirty entry in memory (obj_ver_id => dirty_entry)
// 64 = 24 + 40 bytes per dirty entry in memory (obj_ver_id => dirty_entry)
struct __attribute__((__packed__)) dirty_entry
{
uint32_t state;
@@ -102,6 +103,7 @@ struct __attribute__((__packed__)) dirty_entry
uint32_t offset; // data offset within object (stripe)
uint32_t len; // data length
uint64_t journal_sector; // journal sector used for this entry
void* bitmap; // either external bitmap itself when it fits, or a pointer to it when it doesn't
};
// - Sync must be submitted after previous writes/deletes (not before!)
@@ -216,7 +218,7 @@ class blockstore_impl_t
uint32_t block_order;
uint64_t block_count;
uint32_t clean_entry_bitmap_size = 0, clean_entry_size = 0;
uint32_t clean_entry_bitmap_size = 0, clean_entry_size = 0, entry_attr_size = 0;
int meta_fd;
int data_fd;
@@ -250,6 +252,7 @@ class blockstore_impl_t
void open_data();
void open_meta();
void open_journal();
uint8_t* get_clean_entry_bitmap(uint64_t block_loc, int offset);
// Asynchronous init
int initialized;
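
Taken together, the changes above make each on-disk metadata entry a fixed 24-byte header followed by two variable-size tails. A sketch of the layout and of how the tails are addressed within one entry (entry_buf is a stand-in for the start of one entry in the metadata buffer):

// One metadata entry occupies clean_entry_size bytes:
//   [ clean_disk_entry: oid + version ]  sizeof(clean_disk_entry), 24 bytes
//   [ internal block bitmap           ]  clean_entry_bitmap_size bytes
//   [ external bitmap / attributes    ]  entry_attr_size bytes
uint8_t *int_bmp   = entry_buf + sizeof(clean_disk_entry);   // sparse-write bitmap of the block
uint8_t *ext_attrs = int_bmp + clean_entry_bitmap_size;      // the new external attributes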

src/blockstore_init.cpp (36)

@@ -98,9 +98,9 @@ void blockstore_init_meta::handle_entries(void* entries, unsigned count, int blo
for (unsigned i = 0; i < count; i++)
{
clean_disk_entry *entry = (clean_disk_entry*)(entries + i*bs->clean_entry_size);
if (!bs->inmemory_meta && bs->clean_entry_bitmap_size)
if (!bs->inmemory_meta && (bs->clean_entry_bitmap_size || bs->entry_attr_size))
{
memcpy(bs->clean_bitmap + (done_cnt+i)*bs->clean_entry_bitmap_size, &entry->bitmap, bs->clean_entry_bitmap_size);
memcpy(bs->clean_bitmap + (done_cnt+i)*(bs->clean_entry_bitmap_size + bs->entry_attr_size), &entry->bitmap, (bs->clean_entry_bitmap_size + bs->entry_attr_size));
}
if (entry->oid.inode > 0)
{
@@ -545,6 +545,21 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
.oid = je->small_write.oid,
.version = je->small_write.version,
};
void *bmp = (void*)je + sizeof(journal_entry_small_write);
if (bs->entry_attr_size <= sizeof(void*))
{
memcpy(&bmp, bmp, bs->entry_attr_size);
}
else if (!bs->journal.inmemory)
{
// FIXME Using large blockstore objects and not keeping journal in memory
// will result in a lot of small allocations for entry bitmaps. This can
// only be fixed by using a patched map with dynamic entry size, but not
// the btree_map, because it doesn't keep iterators valid all the time.
void *bmp_cp = malloc_or_die(bs->entry_attr_size);
memcpy(bmp_cp, bmp, bs->entry_attr_size);
bmp = bmp_cp;
}
bs->dirty_db.emplace(ov, (dirty_entry){
.state = (BS_ST_SMALL_WRITE | BS_ST_SYNCED),
.flags = 0,
@@ -552,6 +567,7 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
.offset = je->small_write.offset,
.len = je->small_write.len,
.journal_sector = proc_pos,
.bitmap = bmp,
});
bs->journal.used_sectors[proc_pos]++;
#ifdef BLOCKSTORE_DEBUG
@@ -609,6 +625,21 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
.oid = je->big_write.oid,
.version = je->big_write.version,
};
void *bmp = (void*)je + sizeof(journal_entry_big_write);
if (bs->entry_attr_size <= sizeof(void*))
{
memcpy(&bmp, bmp, bs->entry_attr_size);
}
else if (!bs->journal.inmemory)
{
// FIXME Using large blockstore objects and not keeping journal in memory
// will result in a lot of small allocations for entry bitmaps. This can
// only be fixed by using a patched map with dynamic entry size, but not
// the btree_map, because it doesn't keep iterators valid all the time.
void *bmp_cp = malloc_or_die(bs->entry_attr_size);
memcpy(bmp_cp, bmp, bs->entry_attr_size);
bmp = bmp_cp;
}
auto dirty_it = bs->dirty_db.emplace(ov, (dirty_entry){
.state = (BS_ST_BIG_WRITE | BS_ST_SYNCED),
.flags = 0,
@@ -616,6 +647,7 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
.offset = je->big_write.offset,
.len = je->big_write.len,
.journal_sector = proc_pos,
.bitmap = bmp,
}).first;
if (bs->data_alloc->get(je->big_write.location >> bs->block_order))
{
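
Both replay hunks above recover the external bitmap with the same trick: bmp initially points at the bytes that follow the fixed journal entry, and when the attributes fit into a pointer, memcpy(&bmp, bmp, entry_attr_size) overwrites the pointer variable with the bytes themselves, so nothing has to be allocated. A heap copy is made only for large attributes when the journal is not kept in memory; otherwise the pointer into the in-memory journal buffer stays valid. A condensed sketch (entry_attr_size and journal_inmemory stand for the bs-> fields used above):

// Sketch: recovering a replayed entry's external bitmap.
void *bmp = (void*)je + sizeof(journal_entry_small_write);  // bytes right after the fixed part
if (entry_attr_size <= sizeof(void*))
    memcpy(&bmp, bmp, entry_attr_size);       // the pointer variable now holds the bytes themselves
else if (!journal_inmemory)
{
    void *copy = malloc_or_die(entry_attr_size);
    memcpy(copy, bmp, entry_attr_size);       // snapshot, because the replay buffer is reused
    bmp = copy;                               // the dirty_db entry takes ownership of the copy
}
// else: keep pointing into the in-memory journal buffer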

src/blockstore_journal.h (6)

@@ -54,6 +54,9 @@ struct __attribute__((__packed__)) journal_entry_small_write
// data_offset is its offset within journal
uint64_t data_offset;
uint32_t crc32_data;
// small_write and big_write entries are followed by the "external" bitmap
// its size is dynamic and included in journal entry's <size> field
uint8_t bitmap[];
};
struct __attribute__((__packed__)) journal_entry_big_write
@@ -68,6 +71,9 @@ struct __attribute__((__packed__)) journal_entry_big_write
uint32_t offset;
uint32_t len;
uint64_t location;
// small_write and big_write entries are followed by the "external" bitmap
// its size is dynamic and included in journal entry's <size> field
uint8_t bitmap[];
};
struct __attribute__((__packed__)) journal_entry_stable
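
Since bitmap[] is a flexible array member, it adds nothing to sizeof(journal_entry_small_write); the write path below reserves sizeof(...) + entry_attr_size bytes when prefilling the entry, so the entry's size field, and with it the checksum, covers the dynamic tail (my reading of je_crc32, which hashes the entry according to its size field). A sketch of the sizing, with bmp_bytes as a stand-in for the attribute bytes being written:

// Sketch: a small_write journal entry with its trailing external bitmap.
journal_entry_small_write *je = (journal_entry_small_write*)prefill_single_journal_entry(
    journal, JE_SMALL_WRITE, sizeof(journal_entry_small_write) + entry_attr_size);
// ... fill the fixed fields ...
memcpy(je->bitmap, bmp_bytes, entry_attr_size);   // same address as (void*)(je+1) in blockstore_write.cpp
je->crc32 = je_crc32((journal_entry*)je);          // covers the fixed part plus the trailing bitmap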

src/blockstore_open.cpp (9)

@@ -62,6 +62,7 @@ void blockstore_impl_t::parse_config(blockstore_config_t & config)
cfg_data_size = strtoull(config["data_size"].c_str(), NULL, 10);
meta_device = config["meta_device"];
meta_offset = strtoull(config["meta_offset"].c_str(), NULL, 10);
entry_attr_size = strtoull(config["entry_attr_size"].c_str(), NULL, 10);
block_size = strtoull(config["block_size"].c_str(), NULL, 10);
inmemory_meta = config["inmemory_metadata"] != "false";
journal_device = config["journal_device"];
@@ -106,7 +107,7 @@ void blockstore_impl_t::parse_config(blockstore_config_t & config)
}
else if (disk_alignment % MEM_ALIGNMENT)
{
throw std::runtime_error("disk_alingment must be a multiple of "+std::to_string(MEM_ALIGNMENT));
throw std::runtime_error("disk_alignment must be a multiple of "+std::to_string(MEM_ALIGNMENT));
}
if (!journal_block_size)
{
@@ -182,7 +183,7 @@ void blockstore_impl_t::parse_config(blockstore_config_t & config)
}
// init some fields
clean_entry_bitmap_size = block_size / bitmap_granularity / 8;
clean_entry_size = sizeof(clean_disk_entry) + clean_entry_bitmap_size;
clean_entry_size = sizeof(clean_disk_entry) + clean_entry_bitmap_size + entry_attr_size;
journal.block_size = journal_block_size;
journal.next_free = journal_block_size;
journal.used_start = journal_block_size;
@@ -247,9 +248,9 @@ void blockstore_impl_t::calc_lengths()
if (!metadata_buffer)
throw std::runtime_error("Failed to allocate memory for the metadata");
}
else if (clean_entry_bitmap_size)
else if (clean_entry_bitmap_size || entry_attr_size)
{
clean_bitmap = (uint8_t*)malloc(block_count * clean_entry_bitmap_size);
clean_bitmap = (uint8_t*)malloc(block_count * (clean_entry_bitmap_size + entry_attr_size));
if (!clean_bitmap)
throw std::runtime_error("Failed to allocate memory for the metadata sparse write bitmap");
}
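
A worked example of the size arithmetic above, using the defaults implied by the comments in blockstore_impl.h (4-byte internal bitmap, 4-byte external attributes); block_size = 128 KiB and bitmap_granularity = 4 KiB are assumptions here:

// clean_entry_bitmap_size = block_size / bitmap_granularity / 8
//                         = 131072 / 4096 / 8 = 4 bytes
// clean_entry_size        = sizeof(clean_disk_entry) + clean_entry_bitmap_size + entry_attr_size
//                         = 24 + 4 + 4 = 32 bytes per object in the metadata area
// and when metadata is not kept in memory, clean_bitmap grows to
//   block_count * (clean_entry_bitmap_size + entry_attr_size) = 8 bytes of RAM per data block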

src/blockstore_read.cpp (37)

@@ -94,6 +94,21 @@ endwhile:
return 1;
}
uint8_t* blockstore_impl_t::get_clean_entry_bitmap(uint64_t block_loc, int offset)
{
uint8_t *clean_entry_bitmap;
uint64_t meta_loc = block_loc >> block_order;
if (inmemory_meta)
{
uint64_t sector = (meta_loc / (meta_block_size / clean_entry_size)) * meta_block_size;
uint64_t pos = (meta_loc % (meta_block_size / clean_entry_size));
clean_entry_bitmap = (uint8_t*)(metadata_buffer + sector + pos*clean_entry_size + sizeof(clean_disk_entry) + offset);
}
else
clean_entry_bitmap = (uint8_t*)(clean_bitmap + meta_loc*(clean_entry_bitmap_size + entry_attr_size) + offset);
return clean_entry_bitmap;
}
int blockstore_impl_t::dequeue_read(blockstore_op_t *read_op)
{
auto clean_it = clean_db.find(read_op->oid);
@@ -134,6 +149,10 @@ int blockstore_impl_t::dequeue_read(blockstore_op_t *read_op)
if (!result_version)
{
result_version = dirty_it->first.version;
if (entry_attr_size <= sizeof(void*))
read_op->bitmap = dirty_it->second.bitmap;
else if (read_op->bitmap)
memcpy(read_op->bitmap, dirty_it->second.bitmap, entry_attr_size);
}
if (!fulfill_read(read_op, fulfilled, dirty.offset, dirty.offset + dirty.len,
dirty.state, dirty_it->first.version, dirty.location + (IS_JOURNAL(dirty.state) ? 0 : dirty.offset)))
@@ -155,6 +174,11 @@ int blockstore_impl_t::dequeue_read(blockstore_op_t *read_op)
if (!result_version)
{
result_version = clean_it->second.version;
void *clean_entry_bitmap = get_clean_entry_bitmap(clean_it->second.location, clean_entry_bitmap_size);
if (entry_attr_size <= sizeof(void*))
memcpy(&read_op->bitmap, clean_entry_bitmap, entry_attr_size);
else if (read_op->bitmap)
memcpy(read_op->bitmap, clean_entry_bitmap, entry_attr_size);
}
if (fulfilled < read_op->len)
{
@@ -169,18 +193,7 @@ int blockstore_impl_t::dequeue_read(blockstore_op_t *read_op)
}
else
{
uint64_t meta_loc = clean_it->second.location >> block_order;
uint8_t *clean_entry_bitmap;
if (inmemory_meta)
{
uint64_t sector = (meta_loc / (meta_block_size / clean_entry_size)) * meta_block_size;
uint64_t pos = (meta_loc % (meta_block_size / clean_entry_size));
clean_entry_bitmap = (uint8_t*)(metadata_buffer + sector + pos*clean_entry_size + sizeof(clean_disk_entry));
}
else
{
clean_entry_bitmap = (uint8_t*)(clean_bitmap + meta_loc*clean_entry_bitmap_size);
}
uint8_t *clean_entry_bitmap = get_clean_entry_bitmap(clean_it->second.location, 0);
uint64_t bmp_start = 0, bmp_end = 0, bmp_size = block_size/bitmap_granularity;
while (bmp_start < bmp_size)
{
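
On the read path the attributes are returned through read_op->bitmap using the same dual convention: for a dirty in-memory version they come from dirty_entry.bitmap, for a clean version they are read at offset clean_entry_bitmap_size inside the metadata entry via the new get_clean_entry_bitmap() helper. A sketch of where a caller finds them once the read completes (illustration only, not part of this commit):

// Sketch: locating the external attributes after a BS_OP_READ completes.
uint8_t *attrs = entry_attr_size <= sizeof(void*)
    ? (uint8_t*)&op->bitmap   // bytes were stored directly in the pointer-sized field
    : (uint8_t*)op->bitmap;   // bytes were copied into the caller-provided buffer, if one was passed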

src/blockstore_rollback.cpp (5)

@@ -268,6 +268,11 @@ void blockstore_impl_t::erase_dirty(blockstore_dirty_db_t::iterator dirty_start,
{
journal.used_sectors.erase(dirty_it->second.journal_sector);
}
if (entry_attr_size > sizeof(void*))
{
free(dirty_it->second.bitmap);
dirty_it->second.bitmap = NULL;
}
if (dirty_it == dirty_start)
{
break;

src/blockstore_write.cpp (18)

@@ -95,6 +95,7 @@ bool blockstore_impl_t::enqueue_write(blockstore_op_t *op)
#endif
// FIXME No strict need to add it into dirty_db here, it's just left
// from the previous implementation where reads waited for writes
void *bmp = NULL;
uint32_t state;
if (is_del)
state = BS_ST_DELETE | BS_ST_IN_FLIGHT;
@@ -109,6 +110,14 @@ bool blockstore_impl_t::enqueue_write(blockstore_op_t *op)
state |= BS_ST_IN_FLIGHT;
if (op->opcode == BS_OP_WRITE_STABLE)
state |= BS_ST_INSTANT;
if (entry_attr_size > sizeof(void*))
{
bmp = calloc_or_die(1, entry_attr_size);
if (op->bitmap)
memcpy(bmp, op->bitmap, entry_attr_size);
}
else
bmp = op->bitmap;
}
dirty_db.emplace((obj_ver_id){
.oid = op->oid,
@@ -120,6 +129,7 @@ bool blockstore_impl_t::enqueue_write(blockstore_op_t *op)
.offset = is_del ? 0 : op->offset,
.len = is_del ? 0 : op->len,
.journal_sector = 0,
.bitmap = bmp,
});
return true;
}
@@ -128,6 +138,8 @@ void blockstore_impl_t::cancel_all_writes(blockstore_op_t *op, blockstore_dirty_
{
while (dirty_it != dirty_db.end() && dirty_it->first.oid == op->oid)
{
if (entry_attr_size > sizeof(void*))
free(dirty_it->second.bitmap);
dirty_db.erase(dirty_it++);
}
bool found = false;
@@ -305,7 +317,7 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
// Then pre-fill journal entry
journal_entry_small_write *je = (journal_entry_small_write*)prefill_single_journal_entry(
journal, op->opcode == BS_OP_WRITE_STABLE ? JE_SMALL_WRITE_INSTANT : JE_SMALL_WRITE,
sizeof(journal_entry_small_write)
sizeof(journal_entry_small_write) + entry_attr_size
);
dirty_it->second.journal_sector = journal.sector_info[journal.cur_sector].offset;
journal.used_sectors[journal.sector_info[journal.cur_sector].offset]++;
@@ -324,6 +336,7 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
je->len = op->len;
je->data_offset = journal.next_free;
je->crc32_data = crc32c(0, op->buf, op->len);
memcpy((void*)(je+1), (entry_attr_size > sizeof(void*) ? dirty_it->second.bitmap : &dirty_it->second.bitmap), entry_attr_size);
je->crc32 = je_crc32((journal_entry*)je);
journal.crc32_last = je->crc32;
if (immediate_commit != IMMEDIATE_NONE)
@@ -396,7 +409,7 @@ resume_2:
BS_SUBMIT_GET_SQE_DECL(sqe);
je = (journal_entry_big_write*)prefill_single_journal_entry(
journal, op->opcode == BS_OP_WRITE_STABLE ? JE_BIG_WRITE_INSTANT : JE_BIG_WRITE,
sizeof(journal_entry_big_write)
sizeof(journal_entry_big_write) + entry_attr_size
);
dirty_it->second.journal_sector = journal.sector_info[journal.cur_sector].offset;
journal.used_sectors[journal.sector_info[journal.cur_sector].offset]++;
@@ -412,6 +425,7 @@ resume_2:
je->offset = op->offset;
je->len = op->len;
je->location = dirty_it->second.location;
memcpy((void*)(je+1), (entry_attr_size > sizeof(void*) ? dirty_it->second.bitmap : &dirty_it->second.bitmap), entry_attr_size);
je->crc32 = je_crc32((journal_entry*)je);
journal.crc32_last = je->crc32;
prepare_journal_sector_write(journal, journal.cur_sector, sqe,
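
Together with blockstore_rollback.cpp above, the write path also settles ownership of dirty_entry.bitmap: for attributes larger than a pointer, enqueue_write() callocs a private copy (initialized from op->bitmap when provided), and that copy is freed when the dirty version is dropped in cancel_all_writes() or erase_dirty(); for small attributes the value lives inline and nothing is allocated or freed. A condensed sketch of the release side:

// Sketch: dropping a dirty version's external bitmap, as done in erase_dirty()/cancel_all_writes().
void drop_dirty_bitmap(dirty_entry & de, uint32_t entry_attr_size)
{
    if (entry_attr_size > sizeof(void*))
    {
        free(de.bitmap);   // heap copy made by enqueue_write() via calloc_or_die()
        de.bitmap = NULL;
    }
    // else: the value is stored inline in the field, nothing to free
}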

src/osd_rmw.cpp (2)

@@ -215,7 +215,7 @@ int* get_jerasure_decoding_matrix(osd_rmw_stripe_t *stripes, int pg_size, int pg
auto dec_it = matrix->decodings.find((reed_sol_erased_t){ .data = erased, .size = pg_size });
if (dec_it == matrix->decodings.end())
{
int *dm_ids = (int*)malloc(sizeof(int)*(pg_minsize + pg_minsize*pg_minsize + pg_size));
int *dm_ids = (int*)malloc_or_die(sizeof(int)*(pg_minsize + pg_minsize*pg_minsize + pg_size));
int *decoding_matrix = dm_ids + pg_minsize;
if (!dm_ids)
throw std::bad_alloc();
