From d2d8d6e7fb791e179773217f1e53d9af47aacc0e Mon Sep 17 00:00:00 2001
From: Vitaliy Filippov
Date: Mon, 11 Nov 2019 02:53:19 +0300
Subject: [PATCH] Comments about stabilize operation, track unsynced_writes correctly

---
 blockstore.h          |  5 +++--
 blockstore_read.cpp   | 27 +++++++++++++++-------
 blockstore_stable.cpp | 52 +++++++++++++++++++++++++++++++------------
 blockstore_write.cpp  | 25 ++++++++-------------
 4 files changed, 69 insertions(+), 40 deletions(-)

diff --git a/blockstore.h b/blockstore.h
index 25706cf5..8ac61235 100644
--- a/blockstore.h
+++ b/blockstore.h
@@ -275,9 +275,9 @@ class blockstore
 
     // Read
     int dequeue_read(blockstore_operation *read_op);
-    int fulfill_read(blockstore_operation *read_op, uint32_t item_start, uint32_t item_end,
+    int fulfill_read(blockstore_operation *read_op, uint64_t &fulfilled, uint32_t item_start, uint32_t item_end,
         uint32_t item_state, uint64_t item_version, uint64_t item_location);
-    int fulfill_read_push(blockstore_operation *read_op, uint32_t item_start,
+    int fulfill_read_push(blockstore_operation *read_op, uint64_t &fulfilled, uint32_t item_start,
         uint32_t item_state, uint64_t item_version, uint64_t item_location, uint32_t cur_start, uint32_t cur_end);
     void handle_read_event(ring_data_t *data, blockstore_operation *op);
 
@@ -294,6 +294,7 @@ class blockstore
 
     // Stable
     int dequeue_stable(blockstore_operation *op);
+    int continue_stable(blockstore_operation *op);
     void handle_stable_event(ring_data_t *data, blockstore_operation *op);
 
 public:
diff --git a/blockstore_read.cpp b/blockstore_read.cpp
index c7b975d1..d62c0703 100644
--- a/blockstore_read.cpp
+++ b/blockstore_read.cpp
@@ -1,6 +1,6 @@
 #include "blockstore.h"
 
-int blockstore::fulfill_read_push(blockstore_operation *op, uint32_t item_start,
+int blockstore::fulfill_read_push(blockstore_operation *op, uint64_t &fulfilled, uint32_t item_start,
     uint32_t item_state, uint64_t item_version, uint64_t item_location, uint32_t cur_start, uint32_t cur_end)
 {
     if (cur_end > cur_start)
@@ -31,11 +31,12 @@ int blockstore::fulfill_read_push(blockstore_operation *op, uint32_t item_start,
             (IS_JOURNAL(item_state) ? journal.offset : data_offset) + item_location + cur_start - item_start
         );
         data->op = op;
+        fulfilled += cur_end-cur_start;
     }
     return 1;
 }
 
-int blockstore::fulfill_read(blockstore_operation *read_op, uint32_t item_start, uint32_t item_end,
+int blockstore::fulfill_read(blockstore_operation *read_op, uint64_t &fulfilled, uint32_t item_start, uint32_t item_end,
     uint32_t item_state, uint64_t item_version, uint64_t item_location)
 {
     uint32_t cur_start = item_start;
@@ -54,14 +55,14 @@ int blockstore::fulfill_read(blockstore_operation *read_op, uint32_t item_start,
         }
         while (fulfill_near != read_op->read_vec.end() && fulfill_near->first < item_end)
         {
-            if (!fulfill_read_push(read_op, item_start, item_state, item_version, item_location, cur_start, fulfill_near->first))
+            if (!fulfill_read_push(read_op, fulfilled, item_start, item_state, item_version, item_location, cur_start, fulfill_near->first))
             {
                 return 0;
             }
             cur_start = fulfill_near->first + fulfill_near->second.iov_len;
             fulfill_near++;
         }
-        if (!fulfill_read_push(read_op, item_start, item_state, item_version, item_location, cur_start, item_end))
+        if (!fulfill_read_push(read_op, fulfilled, item_start, item_state, item_version, item_location, cur_start, item_end))
         {
             return 0;
         }
@@ -87,16 +88,22 @@ int blockstore::dequeue_read(blockstore_operation *read_op)
         read_op->callback(read_op);
         return 1;
     }
-    // FIXME track fulfilled and stop when it is equal to read_op->len
     uint64_t fulfilled = 0;
     if (dirty_found)
     {
         while (dirty_it->first.oid == read_op->oid)
         {
             dirty_entry& dirty = dirty_it->second;
-            if (IS_STABLE(dirty.state) || read_op->version >= dirty_it->first.version)
+            bool version_ok = read_op->version >= dirty_it->first.version;
+            if (IS_STABLE(dirty.state))
             {
-                if (!fulfill_read(read_op, dirty.offset, dirty.offset + dirty.size,
+                if (!version_ok && read_op->version != 0)
+                    read_op->version = dirty_it->first.version;
+                version_ok = true;
+            }
+            if (version_ok)
+            {
+                if (!fulfill_read(read_op, fulfilled, dirty.offset, dirty.offset + dirty.size,
                     dirty.state, dirty_it->first.version, dirty.location))
                 {
                     // need to wait. undo added requests, don't dequeue op
@@ -104,12 +111,16 @@ int blockstore::dequeue_read(blockstore_operation *read_op)
                     return 0;
                 }
             }
+            if (fulfilled == read_op->len)
+            {
+                break;
+            }
             dirty_it--;
         }
     }
     if (clean_it != clean_db.end())
     {
-        if (!fulfill_read(read_op, 0, block_size, ST_CURRENT, 0, clean_it->second.location))
+        if (!fulfill_read(read_op, fulfilled, 0, block_size, ST_CURRENT, 0, clean_it->second.location))
        {
             // need to wait. undo added requests, don't dequeue op
             read_op->read_vec.clear();
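[Editor's note: the read changes above thread a "fulfilled" byte counter by
reference through fulfill_read()/fulfill_read_push(), so that dequeue_read()
can stop walking older dirty versions as soon as the whole request is covered
(resolving the removed FIXME). A minimal standalone sketch of that pattern,
separate from the patch; segment and fulfill_segment() are illustrative names,
not the blockstore API:

    #include <cinttypes>
    #include <cstdint>
    #include <cstdio>
    #include <vector>

    struct segment { uint32_t start, end; };

    // Analogue of fulfill_read_push(): account for one copied segment
    static void fulfill_segment(uint64_t &fulfilled, segment s)
    {
        fulfilled += s.end - s.start; // same accounting as fulfilled += cur_end-cur_start
    }

    int main()
    {
        // Non-overlapping pieces of an object, newest version first
        std::vector<segment> pieces = { { 0, 4096 }, { 4096, 8192 }, { 8192, 131072 } };
        uint64_t read_len = 8192, fulfilled = 0;
        for (segment s : pieces)
        {
            fulfill_segment(fulfilled, s);
            if (fulfilled == read_len)
                break; // same early exit as in dequeue_read()
        }
        printf("fulfilled %" PRIu64 " of %" PRIu64 " bytes\n", fulfilled, read_len);
        return 0;
    }
]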
diff --git a/blockstore_stable.cpp b/blockstore_stable.cpp
index a27aa218..ea2594be 100644
--- a/blockstore_stable.cpp
+++ b/blockstore_stable.cpp
@@ -1,5 +1,29 @@
 #include "blockstore.h"
 
+// Stabilize small write:
+// 1) Copy data from the journal to the data device.
+//    Sync it before writing metadata if we want to keep metadata consistent.
+//    Overall it's optional, because the write can be replayed from the journal
+//    until the journal is cleared, and reads are also fulfilled from the journal.
+// 2) Increase the version on the metadata device and sync it.
+// 3) Advance the clean_db entry's version, clear previous journal entries.
+//
+// This makes 1 4K small write+sync look like:
+// 512b+4K (journal) + sync + 512b (journal) + sync + 4K (data) [+ sync?] + 512b (metadata) + sync.
+// WA = 2.375. It's not the best; SSD FTL-like redirect-write could probably be
+// lower, even with defragmentation. But it's fixed, and it's still better than
+// in Ceph. :)
+
+// Stabilize big write:
+// 1) Copy metadata from the journal to the metadata device.
+// 2) Move the dirty_db entry to clean_db and clear previous journal entries.
+//
+// This makes 1 128K big write+sync look like:
+// 128K (data) + sync + 512b (journal) + sync + 512b (journal) + sync + 512b (metadata) + sync.
+// WA = 1.012. Very good :)
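+//
+// (Editor's note, a worked check of the numbers above, assuming 512-byte
+// journal and metadata writes: the small-write path writes
+// 0.5K+4K + 0.5K + 4K + 0.5K = 9.5K per 4K of user data, so WA = 9.5/4 = 2.375;
+// the big-write path writes 128K + 0.5K + 0.5K + 0.5K = 129.5K per 128K,
+// so WA = 129.5/128 ≈ 1.012.)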
+
+// And we must do it in batches, for the sake of a reduced fsync call count.
+
 int blockstore::dequeue_stable(blockstore_operation *op)
 {
     auto dirty_it = dirty_db.find((obj_ver_id){
@@ -61,6 +85,11 @@ int blockstore::dequeue_stable(blockstore_operation *op)
     return 1;
 }
 
+int blockstore::continue_stable(blockstore_operation *op)
+{
+    return 0;
+}
+
 void blockstore::handle_stable_event(ring_data_t *data, blockstore_operation *op)
 {
     if (data->res < 0)
@@ -72,21 +101,13 @@ void blockstore::handle_stable_event(ring_data_t *data, blockstore_operation *op
     op->pending_ops--;
     if (op->pending_ops == 0)
     {
-        // Mark dirty_db entry as stable
+        // Mark all dirty_db entries up to op->version as stable
         auto dirty_it = dirty_db.find((obj_ver_id){
             .oid = op->oid,
             .version = op->version,
         });
         if (dirty_it->second.state == ST_J_SYNCED)
         {
-            // 1) Copy data from the journal to the data device
-            // 2) Increase version on the metadata device
-            // 3) Advance clean_db entry's version, clear previous journal entries
-            // This makes 1 4K small write+sync look like:
-            // 512b+4K (journal) + sync + 512b (journal) + sync + 512b (metadata) + 4K (data) + sync.
-            // WA = 2.375. It's not the best, SSD FTL-like redirect-write with defragmentation
-            // could probably be lower even with defragmentation. But it's fixed and it's still
-            // better than in Ceph. :)
             dirty_it->second.state = ST_J_STABLE;
             // Acknowledge op
             op->retval = 0;
@@ -94,15 +115,18 @@ void blockstore::handle_stable_event(ring_data_t *data, blockstore_operation *op
         }
         else if (dirty_it->second.state == ST_D_META_SYNCED)
         {
-            // 1) Copy metadata from the journal to the metadata device
-            // 2) Move dirty_db entry to clean_db and clear previous journal entries
-            // This makes 1 128K big write+sync look like:
-            // 128K (data) + sync + 512b (journal) + sync + 512b (journal) + sync + 512b (metadata) + sync.
-            // WA = 1.012. Very good :)
             dirty_it->second.state = ST_D_STABLE;
             // Acknowledge op
             op->retval = 0;
             op->callback(op);
         }
+        else if (dirty_it->second.state == ST_J_STABLE)
+        {
+
+        }
+        else if (dirty_it->second.state == ST_D_STABLE)
+        {
+
+        }
     }
 }
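[Editor's note: continue_stable() above is only a stub so far. To illustrate
the batching idea from the new header comment ("we must do it in batches, for
the sake of a reduced fsync call count"), here is a minimal hypothetical
sketch, separate from the patch; obj_ver, stabilize_batch() and
sync_metadata() are illustrative names, not the blockstore API:

    #include <cinttypes>
    #include <cstdint>
    #include <cstdio>
    #include <vector>

    struct obj_ver { uint64_t oid, version; };

    // Stand-in for the real metadata fsync
    static void sync_metadata() { printf("fsync\n"); }

    // Stabilize a whole batch of versions, then sync once: N stabilize
    // operations cost 1 fsync instead of N
    static void stabilize_batch(std::vector<obj_ver> &batch)
    {
        for (const obj_ver &v : batch)
            printf("stabilize oid=%" PRIu64 " ver=%" PRIu64 "\n", v.oid, v.version);
        sync_metadata();
        batch.clear();
    }

    int main()
    {
        std::vector<obj_ver> batch = { { 1, 5 }, { 2, 3 }, { 7, 1 } };
        stabilize_batch(batch); // 3 stabilizes, 1 fsync
        return 0;
    }
]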
diff --git a/blockstore_write.cpp b/blockstore_write.cpp
index 703ff8ec..9b08c8c9 100644
--- a/blockstore_write.cpp
+++ b/blockstore_write.cpp
@@ -66,6 +66,11 @@ int blockstore::dequeue_write(blockstore_operation *op)
         );
         op->pending_ops = 1;
         op->min_used_journal_sector = op->max_used_journal_sector = 0;
+        // Remember write as unsynced
+        unsynced_big_writes.push_back((obj_ver_id){
+            .oid = op->oid,
+            .version = op->version,
+        });
     }
     else
     {
@@ -111,6 +116,10 @@ int blockstore::dequeue_write(blockstore_operation *op)
         dirty_it->second.state = ST_J_SUBMITTED;
         journal.next_free += op->len;
         op->pending_ops = 2;
+        unsynced_small_writes.push_back((obj_ver_id){
+            .oid = op->oid,
+            .version = op->version,
+        });
     }
     return 1;
 }
@@ -155,21 +164,5 @@ void blockstore::handle_write_event(ring_data_t *data, blockstore_operation *op)
         // Acknowledge write without sync
         op->retval = op->len;
         op->callback(op);
-        // Remember write as unsynced
-        // FIXME: Could state change to ST_STABLE? It could break this check
-        if (IS_BIG_WRITE(dirty_entry.state))
-        {
-            unsynced_big_writes.push_back((obj_ver_id){
-                .oid = op->oid,
-                .version = op->version,
-            });
-        }
-        else
-        {
-            unsynced_small_writes.push_back((obj_ver_id){
-                .oid = op->oid,
-                .version = op->version,
-            });
-        }
     }
 }
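[Editor's note: the last hunk moves the unsynced-write bookkeeping from the
completion handler into dequeue_write(), resolving the old FIXME: at
submission time the kind of the write (big or small) is known for certain,
while by completion time the entry's state could in principle have changed
and confused the IS_BIG_WRITE() check. A minimal standalone sketch of the
same idea, with illustrative names (submit_write() is not the blockstore
API):

    #include <cstdint>
    #include <cstdio>
    #include <vector>

    struct obj_ver { uint64_t oid, version; };

    static std::vector<obj_ver> unsynced_big_writes, unsynced_small_writes;

    // Record the (oid, version) pair in the right list at submission time,
    // when the write kind is still unambiguous
    static void submit_write(uint64_t oid, uint64_t version, bool is_big)
    {
        (is_big ? unsynced_big_writes : unsynced_small_writes)
            .push_back({ oid, version });
    }

    int main()
    {
        submit_write(1, 1, true);  // big write -> unsynced_big_writes
        submit_write(1, 2, false); // small write -> unsynced_small_writes
        printf("big: %zu, small: %zu\n",
            unsynced_big_writes.size(), unsynced_small_writes.size());
        return 0;
    }
]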