Add metadata superblock and check it on start

Refuse to start if the superblock is missing or has an unsupported version;
zero out the metadata area when initializing the superblock.
rdma-zerocopy
Vitaliy Filippov 2021-04-10 17:18:10 +03:00
parent f684d9101a
commit 2a02f3c4c7
5 changed files with 152 additions and 27 deletions

View File

@ -51,7 +51,7 @@ async function run()
const meta_offset = options.journal_offset + Math.ceil(options.journal_size/options.device_block_size)*options.device_block_size; const meta_offset = options.journal_offset + Math.ceil(options.journal_size/options.device_block_size)*options.device_block_size;
const entries_per_block = Math.floor(options.device_block_size / (24 + 2*options.object_size/options.bitmap_granularity/8)); const entries_per_block = Math.floor(options.device_block_size / (24 + 2*options.object_size/options.bitmap_granularity/8));
const object_count = Math.floor((device_size-meta_offset)/options.object_size); const object_count = Math.floor((device_size-meta_offset)/options.object_size);
const meta_size = Math.ceil(object_count / entries_per_block) * options.device_block_size; const meta_size = Math.ceil(1 + object_count / entries_per_block) * options.device_block_size;
const data_offset = meta_offset + meta_size; const data_offset = meta_offset + meta_size;
const meta_size_fmt = (meta_size > 1024*1024*1024 ? Math.round(meta_size/1024/1024/1024*100)/100+" GB" const meta_size_fmt = (meta_size > 1024*1024*1024 ? Math.round(meta_size/1024/1024/1024*100)/100+" GB"
: Math.round(meta_size/1024/1024*100)/100+" MB"); : Math.round(meta_size/1024/1024*100)/100+" MB");
@ -65,6 +65,9 @@ async function run()
); );
} }
process.stdout.write( process.stdout.write(
(options.device_block_size != 4096 ?
` --meta_block_size ${options.device}\n`+
` --journal_block-size ${options.device}\n` : '')+
` --data_device ${options.device}\n`+ ` --data_device ${options.device}\n`+
` --journal_offset ${options.journal_offset}\n`+ ` --journal_offset ${options.journal_offset}\n`+
` --meta_offset ${meta_offset}\n`+ ` --meta_offset ${meta_offset}\n`+

View File

@ -78,6 +78,23 @@
#include "blockstore_journal.h" #include "blockstore_journal.h"
// "VITAstor"
#define BLOCKSTORE_META_MAGIC 0x726F747341544956l
#define BLOCKSTORE_META_VERSION 1
// metadata header (superblock)
// FIXME: After adding the OSD superblock, add a key to metadata
// and journal headers to check if they belong to the same OSD
// On-disk header of the metadata superblock. The metadata loader casts the start of
// the first metadata block to this struct. The struct is packed, so the six fields
// below occupy 36 contiguous bytes with no padding.
struct __attribute__((__packed__)) blockstore_meta_header_t
{
// Written as 0 when the superblock is initialized; a non-zero value makes the loader
// report the metadata as corrupt or old and exit
uint64_t zero;
// Must equal BLOCKSTORE_META_MAGIC, otherwise the loader refuses to start
uint64_t magic;
// Must equal BLOCKSTORE_META_VERSION, otherwise the loader refuses to start
uint64_t version;
// The three values below are filled from the blockstore configuration at initialization
// (meta_block_size, block_size, bitmap_granularity) and compared against the running
// configuration on start; any mismatch is reported and the process exits
uint32_t meta_block_size;
uint32_t data_block_size;
uint32_t bitmap_granularity;
};
// 32 bytes = 24 bytes + block bitmap (4 bytes by default) + external attributes (also bitmap, 4 bytes by default) // 32 bytes = 24 bytes + block bitmap (4 bytes by default) + external attributes (also bitmap, 4 bytes by default)
// per "clean" entry on disk with fixed metadata tables // per "clean" entry on disk with fixed metadata tables
// FIXME: maybe add crc32's to metadata // FIXME: maybe add crc32's to metadata

View File

@ -3,6 +3,20 @@
#include "blockstore_impl.h" #include "blockstore_impl.h"
// Takes a submission queue entry from `bs` into the caller's `sqe` variable and sets
// the caller's `data` to the ring_data_t pointer stored in that entry's user_data.
// Throws std::runtime_error when bs->get_sqe() returns null.
// The body is wrapped in do { } while (0) so that `GET_SQE();` expands to exactly one
// statement: it stays correct under an unbraced `if` and can be followed by `else`.
#define GET_SQE() \
    do\
    {\
        sqe = bs->get_sqe();\
        if (!sqe)\
            throw std::runtime_error("io_uring is full during initialization");\
        data = ((ring_data_t*)sqe->user_data);\
    } while (0)
// Reports whether the first `len` 64-bit words at `buf` are all zero.
// `len` is a count of uint64_t elements, not bytes; a count of zero or less yields true.
static bool iszero(uint64_t *buf, int len)
{
    int pos = 0;
    while (pos < len)
    {
        if (buf[pos])
            return false;
        pos++;
    }
    return true;
}
blockstore_init_meta::blockstore_init_meta(blockstore_impl_t *bs) blockstore_init_meta::blockstore_init_meta(blockstore_impl_t *bs)
{ {
this->bs = bs; this->bs = bs;
@ -10,7 +24,7 @@ blockstore_init_meta::blockstore_init_meta(blockstore_impl_t *bs)
void blockstore_init_meta::handle_event(ring_data_t *data) void blockstore_init_meta::handle_event(ring_data_t *data)
{ {
if (data->res <= 0) if (data->res < 0)
{ {
throw std::runtime_error( throw std::runtime_error(
std::string("read metadata failed at offset ") + std::to_string(metadata_read) + std::string("read metadata failed at offset ") + std::to_string(metadata_read) +
@ -28,6 +42,12 @@ int blockstore_init_meta::loop()
{ {
if (wait_state == 1) if (wait_state == 1)
goto resume_1; goto resume_1;
else if (wait_state == 2)
goto resume_2;
else if (wait_state == 3)
goto resume_3;
else if (wait_state == 4)
goto resume_4;
printf("Reading blockstore metadata\n"); printf("Reading blockstore metadata\n");
if (bs->inmemory_meta) if (bs->inmemory_meta)
metadata_buffer = bs->metadata_buffer; metadata_buffer = bs->metadata_buffer;
@ -35,22 +55,98 @@ int blockstore_init_meta::loop()
metadata_buffer = memalign(MEM_ALIGNMENT, 2*bs->metadata_buf_size); metadata_buffer = memalign(MEM_ALIGNMENT, 2*bs->metadata_buf_size);
if (!metadata_buffer) if (!metadata_buffer)
throw std::runtime_error("Failed to allocate metadata read buffer"); throw std::runtime_error("Failed to allocate metadata read buffer");
// Read superblock
GET_SQE();
data->iov = { metadata_buffer, bs->meta_block_size };
data->callback = [this](ring_data_t *data) { handle_event(data); };
my_uring_prep_readv(sqe, bs->meta_fd, &data->iov, 1, bs->meta_offset);
bs->ringloop->submit();
submitted = 1;
resume_1:
if (submitted)
{
wait_state = 1;
return 1;
}
if (iszero((uint64_t*)metadata_buffer, bs->meta_block_size / sizeof(uint64_t)))
{
{
blockstore_meta_header_t *hdr = (blockstore_meta_header_t *)metadata_buffer;
hdr->zero = 0;
hdr->magic = BLOCKSTORE_META_MAGIC;
hdr->version = BLOCKSTORE_META_VERSION;
hdr->meta_block_size = bs->meta_block_size;
hdr->data_block_size = bs->block_size;
hdr->bitmap_granularity = bs->bitmap_granularity;
}
if (bs->readonly)
{
printf("Skipping metadata initialization because blockstore is readonly\n");
}
else
{
printf("Initializing metadata area\n");
GET_SQE();
data->iov = (struct iovec){ metadata_buffer, bs->meta_block_size };
data->callback = [this](ring_data_t *data) { handle_event(data); };
my_uring_prep_writev(sqe, bs->meta_fd, &data->iov, 1, bs->meta_offset);
bs->ringloop->submit();
submitted = 1;
resume_3:
if (submitted > 0)
{
wait_state = 3;
return 1;
}
zero_on_init = true;
}
}
else
{
blockstore_meta_header_t *hdr = (blockstore_meta_header_t *)metadata_buffer;
if (hdr->zero != 0 ||
hdr->magic != BLOCKSTORE_META_MAGIC ||
hdr->version != BLOCKSTORE_META_VERSION)
{
printf(
"Metadata is corrupt or old version.\n"
" If this is a new OSD please zero out the metadata area before starting it.\n"
" If you need to upgrade from 0.5.x please request it via the issue tracker.\n"
);
exit(1);
}
if (hdr->meta_block_size != bs->meta_block_size ||
hdr->data_block_size != bs->block_size ||
hdr->bitmap_granularity != bs->bitmap_granularity)
{
printf(
"Configuration stored in metadata superblock"
" (meta_block_size=%u, data_block_size=%u, bitmap_granularity=%u)"
" differs from OSD configuration (%lu/%u/%lu).\n",
hdr->meta_block_size, hdr->data_block_size, hdr->bitmap_granularity,
bs->meta_block_size, bs->block_size, bs->bitmap_granularity
);
exit(1);
}
}
// Skip superblock
bs->meta_offset += bs->meta_block_size;
prev_done = 0;
done_len = 0;
done_pos = 0;
metadata_read = 0;
// Read the rest of the metadata
while (1) while (1)
{ {
resume_1: resume_2:
if (submitted) if (submitted)
{ {
wait_state = 1; wait_state = 2;
return 1; return 1;
} }
if (metadata_read < bs->meta_len) if (metadata_read < bs->meta_len)
{ {
sqe = bs->get_sqe(); GET_SQE();
if (!sqe)
{
throw std::runtime_error("io_uring is full while trying to read metadata");
}
data = ((ring_data_t*)sqe->user_data);
data->iov = { data->iov = {
metadata_buffer + (bs->inmemory_meta metadata_buffer + (bs->inmemory_meta
? metadata_read ? metadata_read
@ -58,7 +154,14 @@ int blockstore_init_meta::loop()
bs->meta_len - metadata_read > bs->metadata_buf_size ? bs->metadata_buf_size : bs->meta_len - metadata_read, bs->meta_len - metadata_read > bs->metadata_buf_size ? bs->metadata_buf_size : bs->meta_len - metadata_read,
}; };
data->callback = [this](ring_data_t *data) { handle_event(data); }; data->callback = [this](ring_data_t *data) { handle_event(data); };
my_uring_prep_readv(sqe, bs->meta_fd, &data->iov, 1, bs->meta_offset + metadata_read); if (!zero_on_init)
my_uring_prep_readv(sqe, bs->meta_fd, &data->iov, 1, bs->meta_offset + metadata_read);
else
{
// Fill metadata with zeroes
memset(data->iov.iov_base, 0, data->iov.iov_len);
my_uring_prep_writev(sqe, bs->meta_fd, &data->iov, 1, bs->meta_offset + metadata_read);
}
bs->ringloop->submit(); bs->ringloop->submit();
submitted = (prev == 1 ? 2 : 1); submitted = (prev == 1 ? 2 : 1);
prev = submitted; prev = submitted;
@ -90,6 +193,21 @@ int blockstore_init_meta::loop()
free(metadata_buffer); free(metadata_buffer);
metadata_buffer = NULL; metadata_buffer = NULL;
} }
if (zero_on_init && !bs->disable_meta_fsync)
{
GET_SQE();
my_uring_prep_fsync(sqe, bs->meta_fd, IORING_FSYNC_DATASYNC);
data->iov = { 0 };
data->callback = [this](ring_data_t *data) { handle_event(data); };
submitted = 1;
bs->ringloop->submit();
resume_4:
if (submitted > 0)
{
wait_state = 4;
return 1;
}
}
return 0; return 0;
} }
@ -156,14 +274,6 @@ blockstore_init_journal::blockstore_init_journal(blockstore_impl_t *bs)
}; };
} }
// Returns true when every one of the `len` uint64_t words starting at `buf` equals zero.
// `len` counts words rather than bytes; nothing is inspected when it is zero or negative.
bool iszero(uint64_t *buf, int len)
{
    for (uint64_t *word = buf; len > 0; word++, len--)
    {
        if (*word != 0)
            return false;
    }
    return true;
}
void blockstore_init_journal::handle_event(ring_data_t *data1) void blockstore_init_journal::handle_event(ring_data_t *data1)
{ {
if (data1->res <= 0) if (data1->res <= 0)
@ -188,12 +298,6 @@ void blockstore_init_journal::handle_event(ring_data_t *data1)
submitted_buf = NULL; submitted_buf = NULL;
} }
// Takes a submission queue entry from `bs` into the caller's `sqe` variable and sets
// the caller's `data` to the ring_data_t pointer stored in that entry's user_data.
// Throws std::runtime_error when bs->get_sqe() returns null.
// The body is wrapped in do { } while (0) so that `GET_SQE();` expands to exactly one
// statement: it stays correct under an unbraced `if` and can be followed by `else`.
#define GET_SQE() \
    do\
    {\
        sqe = bs->get_sqe();\
        if (!sqe)\
            throw std::runtime_error("io_uring is full while trying to read journal");\
        data = ((ring_data_t*)sqe->user_data);\
    } while (0)
int blockstore_init_journal::loop() int blockstore_init_journal::loop()
{ {
if (wait_state == 1) if (wait_state == 1)
@ -231,7 +335,7 @@ resume_1:
wait_state = 1; wait_state = 1;
return 1; return 1;
} }
if (iszero((uint64_t*)submitted_buf, bs->journal.block_size)) if (iszero((uint64_t*)submitted_buf, bs->journal.block_size / sizeof(uint64_t)))
{ {
// Journal is empty // Journal is empty
// FIXME handle this wrapping to journal_block_size better (maybe) // FIXME handle this wrapping to journal_block_size better (maybe)

View File

@ -7,6 +7,7 @@ class blockstore_init_meta
{ {
blockstore_impl_t *bs; blockstore_impl_t *bs;
int wait_state = 0, wait_count = 0; int wait_state = 0, wait_count = 0;
bool zero_on_init = false;
void *metadata_buffer = NULL; void *metadata_buffer = NULL;
uint64_t metadata_read = 0; uint64_t metadata_read = 0;
int prev = 0, prev_done = 0, done_len = 0, submitted = 0; int prev = 0, prev_done = 0, done_len = 0, submitted = 0;

View File

@ -257,7 +257,7 @@ void blockstore_impl_t::calc_lengths()
} }
// required metadata size // required metadata size
block_count = data_len / block_size; block_count = data_len / block_size;
meta_len = ((block_count - 1 + meta_block_size / clean_entry_size) / (meta_block_size / clean_entry_size)) * meta_block_size; meta_len = (1 + (block_count - 1 + meta_block_size / clean_entry_size) / (meta_block_size / clean_entry_size)) * meta_block_size;
if (meta_area < meta_len) if (meta_area < meta_len)
{ {
throw std::runtime_error("Metadata area is too small, need at least "+std::to_string(meta_len)+" bytes"); throw std::runtime_error("Metadata area is too small, need at least "+std::to_string(meta_len)+" bytes");