From 2a02f3c4c7329e0fd387ccbff9f2c56c336a325a Mon Sep 17 00:00:00 2001 From: Vitaliy Filippov Date: Sat, 10 Apr 2021 17:18:10 +0300 Subject: [PATCH] Add metadata superblock and check it on start Refuse to start if the superblock is missing or bad version; zero out the metadata area when initializing superblock. --- mon/simple-offsets.js | 5 +- src/blockstore_impl.h | 17 +++++ src/blockstore_init.cpp | 154 +++++++++++++++++++++++++++++++++------- src/blockstore_init.h | 1 + src/blockstore_open.cpp | 2 +- 5 files changed, 152 insertions(+), 27 deletions(-) diff --git a/mon/simple-offsets.js b/mon/simple-offsets.js index 419e5fb7..dd933776 100644 --- a/mon/simple-offsets.js +++ b/mon/simple-offsets.js @@ -51,7 +51,7 @@ async function run() const meta_offset = options.journal_offset + Math.ceil(options.journal_size/options.device_block_size)*options.device_block_size; const entries_per_block = Math.floor(options.device_block_size / (24 + 2*options.object_size/options.bitmap_granularity/8)); const object_count = Math.floor((device_size-meta_offset)/options.object_size); - const meta_size = Math.ceil(object_count / entries_per_block) * options.device_block_size; + const meta_size = Math.ceil(1 + object_count / entries_per_block) * options.device_block_size; const data_offset = meta_offset + meta_size; const meta_size_fmt = (meta_size > 1024*1024*1024 ? Math.round(meta_size/1024/1024/1024*100)/100+" GB" : Math.round(meta_size/1024/1024*100)/100+" MB"); @@ -65,6 +65,9 @@ async function run() ); } process.stdout.write( + (options.device_block_size != 4096 ? + ` --meta_block_size ${options.device}\n`+ + ` --journal_block-size ${options.device}\n` : '')+ ` --data_device ${options.device}\n`+ ` --journal_offset ${options.journal_offset}\n`+ ` --meta_offset ${meta_offset}\n`+ diff --git a/src/blockstore_impl.h b/src/blockstore_impl.h index f4e9e734..a7aa1590 100644 --- a/src/blockstore_impl.h +++ b/src/blockstore_impl.h @@ -78,6 +78,23 @@ #include "blockstore_journal.h" +// "VITAstor" +#define BLOCKSTORE_META_MAGIC 0x726F747341544956l +#define BLOCKSTORE_META_VERSION 1 + +// metadata header (superblock) +// FIXME: After adding the OSD superblock, add a key to metadata +// and journal headers to check if they belong to the same OSD +struct __attribute__((__packed__)) blockstore_meta_header_t +{ + uint64_t zero; + uint64_t magic; + uint64_t version; + uint32_t meta_block_size; + uint32_t data_block_size; + uint32_t bitmap_granularity; +}; + // 32 bytes = 24 bytes + block bitmap (4 bytes by default) + external attributes (also bitmap, 4 bytes by default) // per "clean" entry on disk with fixed metadata tables // FIXME: maybe add crc32's to metadata diff --git a/src/blockstore_init.cpp b/src/blockstore_init.cpp index 5c066b37..ccc3d4ac 100644 --- a/src/blockstore_init.cpp +++ b/src/blockstore_init.cpp @@ -3,6 +3,20 @@ #include "blockstore_impl.h" +#define GET_SQE() \ + sqe = bs->get_sqe();\ + if (!sqe)\ + throw std::runtime_error("io_uring is full during initialization");\ + data = ((ring_data_t*)sqe->user_data) + +static bool iszero(uint64_t *buf, int len) +{ + for (int i = 0; i < len; i++) + if (buf[i] != 0) + return false; + return true; +} + blockstore_init_meta::blockstore_init_meta(blockstore_impl_t *bs) { this->bs = bs; @@ -10,7 +24,7 @@ blockstore_init_meta::blockstore_init_meta(blockstore_impl_t *bs) void blockstore_init_meta::handle_event(ring_data_t *data) { - if (data->res <= 0) + if (data->res < 0) { throw std::runtime_error( std::string("read metadata failed at offset ") + std::to_string(metadata_read) + @@ -28,6 +42,12 @@ int blockstore_init_meta::loop() { if (wait_state == 1) goto resume_1; + else if (wait_state == 2) + goto resume_2; + else if (wait_state == 3) + goto resume_3; + else if (wait_state == 4) + goto resume_4; printf("Reading blockstore metadata\n"); if (bs->inmemory_meta) metadata_buffer = bs->metadata_buffer; @@ -35,22 +55,98 @@ int blockstore_init_meta::loop() metadata_buffer = memalign(MEM_ALIGNMENT, 2*bs->metadata_buf_size); if (!metadata_buffer) throw std::runtime_error("Failed to allocate metadata read buffer"); + // Read superblock + GET_SQE(); + data->iov = { metadata_buffer, bs->meta_block_size }; + data->callback = [this](ring_data_t *data) { handle_event(data); }; + my_uring_prep_readv(sqe, bs->meta_fd, &data->iov, 1, bs->meta_offset); + bs->ringloop->submit(); + submitted = 1; +resume_1: + if (submitted) + { + wait_state = 1; + return 1; + } + if (iszero((uint64_t*)metadata_buffer, bs->meta_block_size / sizeof(uint64_t))) + { + { + blockstore_meta_header_t *hdr = (blockstore_meta_header_t *)metadata_buffer; + hdr->zero = 0; + hdr->magic = BLOCKSTORE_META_MAGIC; + hdr->version = BLOCKSTORE_META_VERSION; + hdr->meta_block_size = bs->meta_block_size; + hdr->data_block_size = bs->block_size; + hdr->bitmap_granularity = bs->bitmap_granularity; + } + if (bs->readonly) + { + printf("Skipping metadata initialization because blockstore is readonly\n"); + } + else + { + printf("Initializing metadata area\n"); + GET_SQE(); + data->iov = (struct iovec){ metadata_buffer, bs->meta_block_size }; + data->callback = [this](ring_data_t *data) { handle_event(data); }; + my_uring_prep_writev(sqe, bs->meta_fd, &data->iov, 1, bs->meta_offset); + bs->ringloop->submit(); + submitted = 1; + resume_3: + if (submitted > 0) + { + wait_state = 3; + return 1; + } + zero_on_init = true; + } + } + else + { + blockstore_meta_header_t *hdr = (blockstore_meta_header_t *)metadata_buffer; + if (hdr->zero != 0 || + hdr->magic != BLOCKSTORE_META_MAGIC || + hdr->version != BLOCKSTORE_META_VERSION) + { + printf( + "Metadata is corrupt or old version.\n" + " If this is a new OSD please zero out the metadata area before starting it.\n" + " If you need to upgrade from 0.5.x please request it via the issue tracker.\n" + ); + exit(1); + } + if (hdr->meta_block_size != bs->meta_block_size || + hdr->data_block_size != bs->block_size || + hdr->bitmap_granularity != bs->bitmap_granularity) + { + printf( + "Configuration stored in metadata superblock" + " (meta_block_size=%u, data_block_size=%u, bitmap_granularity=%u)" + " differs from OSD configuration (%lu/%u/%lu).\n", + hdr->meta_block_size, hdr->data_block_size, hdr->bitmap_granularity, + bs->meta_block_size, bs->block_size, bs->bitmap_granularity + ); + exit(1); + } + } + // Skip superblock + bs->meta_offset += bs->meta_block_size; + prev_done = 0; + done_len = 0; + done_pos = 0; + metadata_read = 0; + // Read the rest of the metadata while (1) { - resume_1: + resume_2: if (submitted) { - wait_state = 1; + wait_state = 2; return 1; } if (metadata_read < bs->meta_len) { - sqe = bs->get_sqe(); - if (!sqe) - { - throw std::runtime_error("io_uring is full while trying to read metadata"); - } - data = ((ring_data_t*)sqe->user_data); + GET_SQE(); data->iov = { metadata_buffer + (bs->inmemory_meta ? metadata_read @@ -58,7 +154,14 @@ int blockstore_init_meta::loop() bs->meta_len - metadata_read > bs->metadata_buf_size ? bs->metadata_buf_size : bs->meta_len - metadata_read, }; data->callback = [this](ring_data_t *data) { handle_event(data); }; - my_uring_prep_readv(sqe, bs->meta_fd, &data->iov, 1, bs->meta_offset + metadata_read); + if (!zero_on_init) + my_uring_prep_readv(sqe, bs->meta_fd, &data->iov, 1, bs->meta_offset + metadata_read); + else + { + // Fill metadata with zeroes + memset(data->iov.iov_base, 0, data->iov.iov_len); + my_uring_prep_writev(sqe, bs->meta_fd, &data->iov, 1, bs->meta_offset + metadata_read); + } bs->ringloop->submit(); submitted = (prev == 1 ? 2 : 1); prev = submitted; @@ -90,6 +193,21 @@ int blockstore_init_meta::loop() free(metadata_buffer); metadata_buffer = NULL; } + if (zero_on_init && !bs->disable_meta_fsync) + { + GET_SQE(); + my_uring_prep_fsync(sqe, bs->meta_fd, IORING_FSYNC_DATASYNC); + data->iov = { 0 }; + data->callback = [this](ring_data_t *data) { handle_event(data); }; + submitted = 1; + bs->ringloop->submit(); + resume_4: + if (submitted > 0) + { + wait_state = 4; + return 1; + } + } return 0; } @@ -156,14 +274,6 @@ blockstore_init_journal::blockstore_init_journal(blockstore_impl_t *bs) }; } -bool iszero(uint64_t *buf, int len) -{ - for (int i = 0; i < len; i++) - if (buf[i] != 0) - return false; - return true; -} - void blockstore_init_journal::handle_event(ring_data_t *data1) { if (data1->res <= 0) @@ -188,12 +298,6 @@ void blockstore_init_journal::handle_event(ring_data_t *data1) submitted_buf = NULL; } -#define GET_SQE() \ - sqe = bs->get_sqe();\ - if (!sqe)\ - throw std::runtime_error("io_uring is full while trying to read journal");\ - data = ((ring_data_t*)sqe->user_data) - int blockstore_init_journal::loop() { if (wait_state == 1) @@ -231,7 +335,7 @@ resume_1: wait_state = 1; return 1; } - if (iszero((uint64_t*)submitted_buf, bs->journal.block_size)) + if (iszero((uint64_t*)submitted_buf, bs->journal.block_size / sizeof(uint64_t))) { // Journal is empty // FIXME handle this wrapping to journal_block_size better (maybe) diff --git a/src/blockstore_init.h b/src/blockstore_init.h index cce032da..c5a21b62 100644 --- a/src/blockstore_init.h +++ b/src/blockstore_init.h @@ -7,6 +7,7 @@ class blockstore_init_meta { blockstore_impl_t *bs; int wait_state = 0, wait_count = 0; + bool zero_on_init = false; void *metadata_buffer = NULL; uint64_t metadata_read = 0; int prev = 0, prev_done = 0, done_len = 0, submitted = 0; diff --git a/src/blockstore_open.cpp b/src/blockstore_open.cpp index 5c97c842..766e3ad0 100644 --- a/src/blockstore_open.cpp +++ b/src/blockstore_open.cpp @@ -257,7 +257,7 @@ void blockstore_impl_t::calc_lengths() } // required metadata size block_count = data_len / block_size; - meta_len = ((block_count - 1 + meta_block_size / clean_entry_size) / (meta_block_size / clean_entry_size)) * meta_block_size; + meta_len = (1 + (block_count - 1 + meta_block_size / clean_entry_size) / (meta_block_size / clean_entry_size)) * meta_block_size; if (meta_area < meta_len) { throw std::runtime_error("Metadata area is too small, need at least "+std::to_string(meta_len)+" bytes");