Comments about stabilize operation, track unsynced_writes correctly

blocking-uring-test
Vitaliy Filippov 2019-11-11 02:53:19 +03:00
parent 8edb9e9d6f
commit d2d8d6e7fb
4 changed files with 69 additions and 40 deletions

blockstore.h

@@ -275,9 +275,9 @@ class blockstore
     // Read
     int dequeue_read(blockstore_operation *read_op);
-    int fulfill_read(blockstore_operation *read_op, uint32_t item_start, uint32_t item_end,
+    int fulfill_read(blockstore_operation *read_op, uint64_t &fulfilled, uint32_t item_start, uint32_t item_end,
         uint32_t item_state, uint64_t item_version, uint64_t item_location);
-    int fulfill_read_push(blockstore_operation *read_op, uint32_t item_start,
+    int fulfill_read_push(blockstore_operation *read_op, uint64_t &fulfilled, uint32_t item_start,
         uint32_t item_state, uint64_t item_version, uint64_t item_location, uint32_t cur_start, uint32_t cur_end);
     void handle_read_event(ring_data_t *data, blockstore_operation *op);
@@ -294,6 +294,7 @@ class blockstore
     // Stable
     int dequeue_stable(blockstore_operation *op);
+    int continue_stable(blockstore_operation *op);
     void handle_stable_event(ring_data_t *data, blockstore_operation *op);
 public:

blockstore_read.cpp

@@ -1,6 +1,6 @@
 #include "blockstore.h"

-int blockstore::fulfill_read_push(blockstore_operation *op, uint32_t item_start,
+int blockstore::fulfill_read_push(blockstore_operation *op, uint64_t &fulfilled, uint32_t item_start,
     uint32_t item_state, uint64_t item_version, uint64_t item_location, uint32_t cur_start, uint32_t cur_end)
 {
     if (cur_end > cur_start)
@@ -31,11 +31,12 @@ int blockstore::fulfill_read_push(blockstore_operation *op, uint32_t item_start,
             (IS_JOURNAL(item_state) ? journal.offset : data_offset) + item_location + cur_start - item_start
         );
         data->op = op;
+        fulfilled += cur_end-cur_start;
     }
     return 1;
 }

-int blockstore::fulfill_read(blockstore_operation *read_op, uint32_t item_start, uint32_t item_end,
+int blockstore::fulfill_read(blockstore_operation *read_op, uint64_t &fulfilled, uint32_t item_start, uint32_t item_end,
     uint32_t item_state, uint64_t item_version, uint64_t item_location)
 {
     uint32_t cur_start = item_start;
@@ -54,14 +55,14 @@ int blockstore::fulfill_read(blockstore_operation *read_op, uint32_t item_start,
     }
     while (fulfill_near != read_op->read_vec.end() && fulfill_near->first < item_end)
     {
-        if (!fulfill_read_push(read_op, item_start, item_state, item_version, item_location, cur_start, fulfill_near->first))
+        if (!fulfill_read_push(read_op, fulfilled, item_start, item_state, item_version, item_location, cur_start, fulfill_near->first))
         {
             return 0;
         }
         cur_start = fulfill_near->first + fulfill_near->second.iov_len;
         fulfill_near++;
     }
-    if (!fulfill_read_push(read_op, item_start, item_state, item_version, item_location, cur_start, item_end))
+    if (!fulfill_read_push(read_op, fulfilled, item_start, item_state, item_version, item_location, cur_start, item_end))
     {
         return 0;
     }
@@ -87,16 +88,22 @@ int blockstore::dequeue_read(blockstore_operation *read_op)
         read_op->callback(read_op);
         return 1;
     }
-    // FIXME track fulfilled and stop when it is equal to read_op->len
+    uint64_t fulfilled = 0;
     if (dirty_found)
     {
         while (dirty_it->first.oid == read_op->oid)
         {
             dirty_entry& dirty = dirty_it->second;
-            if (IS_STABLE(dirty.state) || read_op->version >= dirty_it->first.version)
+            bool version_ok = read_op->version >= dirty_it->first.version;
+            if (IS_STABLE(dirty.state))
             {
-                if (!fulfill_read(read_op, dirty.offset, dirty.offset + dirty.size,
+                if (!version_ok && read_op->version != 0)
+                    read_op->version = dirty_it->first.version;
+                version_ok = true;
+            }
+            if (version_ok)
+            {
+                if (!fulfill_read(read_op, fulfilled, dirty.offset, dirty.offset + dirty.size,
                     dirty.state, dirty_it->first.version, dirty.location))
                 {
                     // need to wait. undo added requests, don't dequeue op
@@ -104,12 +111,16 @@ int blockstore::dequeue_read(blockstore_operation *read_op)
                     return 0;
                 }
             }
+            if (fulfilled == read_op->len)
+            {
+                break;
+            }
             dirty_it--;
         }
     }
     if (clean_it != clean_db.end())
     {
-        if (!fulfill_read(read_op, 0, block_size, ST_CURRENT, 0, clean_it->second.location))
+        if (!fulfill_read(read_op, fulfilled, 0, block_size, ST_CURRENT, 0, clean_it->second.location))
         {
             // need to wait. undo added requests, don't dequeue op
             read_op->read_vec.clear();
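
The change above threads a "fulfilled" byte counter by reference through fulfill_read() and fulfill_read_push(), so dequeue_read() can stop walking older dirty versions as soon as the whole request is covered, which is exactly what the removed FIXME asked for. Below is a minimal standalone sketch of that pattern, with simplified types and invented names (read_vec_t, fill_from_layer); it is not vitastor code:

#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <iterator>
#include <map>

// Maps interval start -> interval length, roughly like read_op->read_vec
typedef std::map<uint32_t, uint32_t> read_vec_t;

// Fill the parts of [item_start, item_end) not yet covered by newer layers,
// accumulating the number of bytes fulfilled so far (passed by reference,
// as in fulfill_read(..., uint64_t &fulfilled, ...))
static void fill_from_layer(read_vec_t & read_vec, uint64_t & fulfilled,
    uint32_t item_start, uint32_t item_end)
{
    uint32_t cur_start = item_start;
    auto near_it = read_vec.lower_bound(item_start);
    // an earlier interval may already cover the beginning of this item
    if (near_it != read_vec.begin())
    {
        auto prev = std::prev(near_it);
        cur_start = std::max(cur_start, prev->first + prev->second);
    }
    while (cur_start < item_end)
    {
        if (near_it == read_vec.end() || near_it->first >= item_end)
        {
            // no newer interval before item_end: fill the tail and stop
            read_vec[cur_start] = item_end - cur_start;
            fulfilled += item_end - cur_start;
            break;
        }
        if (near_it->first > cur_start)
        {
            // fill the gap before the next newer interval
            read_vec[cur_start] = near_it->first - cur_start;
            fulfilled += near_it->first - cur_start;
        }
        cur_start = std::max(cur_start, near_it->first + near_it->second);
        near_it++;
    }
}

int main()
{
    // Read [0, 8192): the newest version covers [4096, 8192), an older one
    // covers everything, and the oldest layer is never visited at all
    read_vec_t read_vec;
    uint64_t fulfilled = 0, len = 8192;
    uint32_t layers[][2] = { { 4096, 8192 }, { 0, 8192 }, { 0, 4096 } };
    for (auto & l: layers)
    {
        fill_from_layer(read_vec, fulfilled, l[0], l[1]);
        if (fulfilled == len)
            break; // the same early exit as "if (fulfilled == read_op->len)"
    }
    for (auto & p: read_vec)
        printf("copy [%u, %u)\n", p.first, p.first + p.second);
    printf("fulfilled %lu of %lu bytes\n", (unsigned long)fulfilled, (unsigned long)len);
    return 0;
}

Each layer only fills the gaps left by newer layers, so fulfilled grows monotonically and the check against the request length is an exact completion test rather than a heuristic.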

blockstore_stable.cpp

@@ -1,5 +1,29 @@
 #include "blockstore.h"

+// Stabilize small write:
+// 1) Copy data from the journal to the data device.
+//    Sync it before writing metadata if we want to keep metadata consistent.
+//    Overall it's optional, because the write can be replayed from the journal
+//    until the journal is cleared, and reads are also fulfilled from the journal.
+// 2) Increase version on the metadata device and sync it.
+// 3) Advance clean_db entry's version, clear previous journal entries.
+//
+// This makes 1 4K small write+sync look like:
+// 512b+4K (journal) + sync + 512b (journal) + sync + 4K (data) [+ sync?] + 512b (metadata) + sync.
+// WA = 2.375. It's not the best; SSD FTL-like redirect-write could probably be
+// lower even with defragmentation. But it's fixed and it's still better than in Ceph. :)
+//
+// Stabilize big write:
+// 1) Copy metadata from the journal to the metadata device.
+// 2) Move dirty_db entry to clean_db and clear previous journal entries.
+//
+// This makes 1 128K big write+sync look like:
+// 128K (data) + sync + 512b (journal) + sync + 512b (journal) + sync + 512b (metadata) + sync.
+// WA = 1.012. Very good :)
+//
+// And we must do it in batches, for the sake of a reduced fsync call count.
 int blockstore::dequeue_stable(blockstore_operation *op)
 {
     auto dirty_it = dirty_db.find((obj_ver_id){
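
The WA (write amplification) figures quoted in the new comment check out: a 4K small write costs 512+4096 (journal) + 512 (journal) + 4096 (data) + 512 (metadata) = 9728 bytes for 4096 bytes of payload, i.e. 9728/4096 = 2.375, and a 128K big write costs 131072 + 512 + 512 + 512 = 132608 bytes for 131072, i.e. about 1.012. A trivial arithmetic check (not vitastor code):

#include <cstdio>

int main()
{
    // 4K small write: 512b+4K journal write + 512b journal entry on
    // stabilize + 4K data rewrite + 512b metadata block
    printf("small WA = %.3f\n", (512 + 4096 + 512 + 4096 + 512) / 4096.0); // 2.375
    // 128K big write: data is written once, plus two 512b journal
    // entries and one 512b metadata block
    printf("big WA   = %.5f\n", (131072 + 512 + 512 + 512) / 131072.0);    // 1.01172, rounds to 1.012
    return 0;
}
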
@@ -61,6 +85,11 @@ int blockstore::dequeue_stable(blockstore_operation *op)
     return 1;
 }

+int blockstore::continue_stable(blockstore_operation *op)
+{
+    return 0;
+}
+
 void blockstore::handle_stable_event(ring_data_t *data, blockstore_operation *op)
 {
     if (data->res < 0)
@@ -72,21 +101,13 @@ void blockstore::handle_stable_event(ring_data_t *data, blockstore_operation *op)
     op->pending_ops--;
     if (op->pending_ops == 0)
     {
-        // Mark dirty_db entry as stable
+        // Mark all dirty_db entries up to op->version as stable
         auto dirty_it = dirty_db.find((obj_ver_id){
             .oid = op->oid,
             .version = op->version,
         });
         if (dirty_it->second.state == ST_J_SYNCED)
         {
-            // 1) Copy data from the journal to the data device
-            // 2) Increase version on the metadata device
-            // 3) Advance clean_db entry's version, clear previous journal entries
-            // This makes 1 4K small write+sync look like:
-            // 512b+4K (journal) + sync + 512b (journal) + sync + 512b (metadata) + 4K (data) + sync.
-            // WA = 2.375. It's not the best, SSD FTL-like redirect-write with defragmentation
-            // could probably be lower even with defragmentation. But it's fixed and it's still
-            // better than in Ceph. :)
             dirty_it->second.state = ST_J_STABLE;
             // Acknowledge op
             op->retval = 0;
@@ -94,15 +115,18 @@ void blockstore::handle_stable_event(ring_data_t *data, blockstore_operation *op)
         }
         else if (dirty_it->second.state == ST_D_META_SYNCED)
         {
-            // 1) Copy metadata from the journal to the metadata device
-            // 2) Move dirty_db entry to clean_db and clear previous journal entries
-            // This makes 1 128K big write+sync look like:
-            // 128K (data) + sync + 512b (journal) + sync + 512b (journal) + sync + 512b (metadata) + sync.
-            // WA = 1.012. Very good :)
             dirty_it->second.state = ST_D_STABLE;
             // Acknowledge op
             op->retval = 0;
             op->callback(op);
         }
+        else if (dirty_it->second.state == ST_J_STABLE)
+        {
+        }
+        else if (dirty_it->second.state == ST_D_STABLE)
+        {
+        }
     }
 }
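
Taken together, these branches form a small per-entry state machine: ST_J_SYNCED (small write) becomes ST_J_STABLE and ST_D_META_SYNCED (big write) becomes ST_D_STABLE, each acknowledging the op, while the two new branches, empty in this commit, simply tolerate entries that are already stable. A compressed sketch of those transitions; the state names come from the source, but the helper itself is invented and is not vitastor code:

#include <cstdio>

enum dirty_state { ST_J_SYNCED, ST_J_STABLE, ST_D_META_SYNCED, ST_D_STABLE };

// Returns the new state; sets ack when the operation should be acknowledged
static dirty_state stable_transition(dirty_state st, bool & ack)
{
    ack = false;
    if (st == ST_J_SYNCED)
    {
        ack = true;        // small write: its journal entry is synced
        return ST_J_STABLE;
    }
    if (st == ST_D_META_SYNCED)
    {
        ack = true;        // big write: its data and metadata are synced
        return ST_D_STABLE;
    }
    return st;             // ST_J_STABLE / ST_D_STABLE: empty branches in this commit
}

int main()
{
    bool ack;
    dirty_state next = stable_transition(ST_J_SYNCED, ack);
    printf("ST_J_SYNCED -> state %d, ack = %d\n", (int)next, (int)ack);
    return 0;
}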

blockstore_write.cpp

@@ -66,6 +66,11 @@ int blockstore::dequeue_write(blockstore_operation *op)
         );
         op->pending_ops = 1;
         op->min_used_journal_sector = op->max_used_journal_sector = 0;
+        // Remember write as unsynced
+        unsynced_big_writes.push_back((obj_ver_id){
+            .oid = op->oid,
+            .version = op->version,
+        });
     }
     else
     {
@@ -111,6 +116,10 @@ int blockstore::dequeue_write(blockstore_operation *op)
         dirty_it->second.state = ST_J_SUBMITTED;
         journal.next_free += op->len;
         op->pending_ops = 2;
+        unsynced_small_writes.push_back((obj_ver_id){
+            .oid = op->oid,
+            .version = op->version,
+        });
     }
     return 1;
 }
@@ -155,21 +164,5 @@ void blockstore::handle_write_event(ring_data_t *data, blockstore_operation *op)
         // Acknowledge write without sync
         op->retval = op->len;
         op->callback(op);
-        // Remember write as unsynced
-        // FIXME: Could state change to ST_STABLE? It could break this check
-        if (IS_BIG_WRITE(dirty_entry.state))
-        {
-            unsynced_big_writes.push_back((obj_ver_id){
-                .oid = op->oid,
-                .version = op->version,
-            });
-        }
-        else
-        {
-            unsynced_small_writes.push_back((obj_ver_id){
-                .oid = op->oid,
-                .version = op->version,
-            });
-        }
     }
 }
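
The block deleted above is the other half of the "track unsynced_writes correctly" fix: writes used to be appended to unsynced_big_writes/unsynced_small_writes only on completion, classified by IS_BIG_WRITE(dirty_entry.state), and the old FIXME rightly suspected that the state could have changed by then (e.g. toward a stable state). After this commit, dequeue_write() records the (oid, version) pair at submission time, in the branch that statically knows which kind of write it is. A toy illustration of that submission-time pattern, with invented types (not vitastor code):

#include <cstdint>
#include <cstdio>
#include <vector>

struct obj_ver_id { uint64_t oid, version; };

struct blockstore_sketch
{
    std::vector<obj_ver_id> unsynced_big_writes, unsynced_small_writes;

    // Called from the submission path: "big or small" is a property of the
    // code branch we are in, so no later state change can misclassify it
    void remember_unsynced(obj_ver_id ov, bool is_big)
    {
        (is_big ? unsynced_big_writes : unsynced_small_writes).push_back(ov);
    }
};

int main()
{
    blockstore_sketch bs;
    bs.remember_unsynced({ 1, 5 }, true);  // big write branch of dequeue_write
    bs.remember_unsynced({ 2, 1 }, false); // small write branch
    printf("big = %zu, small = %zu\n",
        bs.unsynced_big_writes.size(), bs.unsynced_small_writes.size());
    return 0;
}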