Comments about stabilize operation, track unsynced_writes correctly

2019-11-11 02:53:19 +03:00 · 2019-11-11 02:53:19 +03:00 · d2d8d6e7fb
parent 8edb9e9d6f
commit d2d8d6e7fb
4 changed files with 69 additions and 40 deletions
--- a/blockstore.h
+++ b/blockstore.h
@ -275,9 +275,9 @@ class blockstore
    // Read
    int dequeue_read(blockstore_operation *read_op);
-    int fulfill_read(blockstore_operation *read_op, uint32_t item_start, uint32_t item_end,
+    int fulfill_read(blockstore_operation *read_op, uint64_t &fulfilled, uint32_t item_start, uint32_t item_end,
        uint32_t item_state, uint64_t item_version, uint64_t item_location);
-    int fulfill_read_push(blockstore_operation *read_op, uint32_t item_start,
+    int fulfill_read_push(blockstore_operation *read_op, uint64_t &fulfilled, uint32_t item_start,
        uint32_t item_state, uint64_t item_version, uint64_t item_location, uint32_t cur_start, uint32_t cur_end);
    void handle_read_event(ring_data_t *data, blockstore_operation *op);
@ -294,6 +294,7 @@ class blockstore
    // Stable
    int dequeue_stable(blockstore_operation *op);
    int continue_stable(blockstore_operation *op);
    void handle_stable_event(ring_data_t *data, blockstore_operation *op);
 public:
--- a/blockstore_read.cpp
+++ b/blockstore_read.cpp
@ -1,6 +1,6 @@
 #include "blockstore.h"
-int blockstore::fulfill_read_push(blockstore_operation *op, uint32_t item_start,
+int blockstore::fulfill_read_push(blockstore_operation *op, uint64_t &fulfilled, uint32_t item_start,
    uint32_t item_state, uint64_t item_version, uint64_t item_location, uint32_t cur_start, uint32_t cur_end)
 {
    if (cur_end > cur_start)
@ -31,11 +31,12 @@ int blockstore::fulfill_read_push(blockstore_operation *op, uint32_t item_start,
            (IS_JOURNAL(item_state) ? journal.offset : data_offset) + item_location + cur_start - item_start
        );
        data->op = op;
        fulfilled += cur_end-cur_start;
    }
    return 1;
 }
-int blockstore::fulfill_read(blockstore_operation *read_op, uint32_t item_start, uint32_t item_end,
+int blockstore::fulfill_read(blockstore_operation *read_op, uint64_t &fulfilled, uint32_t item_start, uint32_t item_end,
    uint32_t item_state, uint64_t item_version, uint64_t item_location)
 {
    uint32_t cur_start = item_start;
@ -54,14 +55,14 @@ int blockstore::fulfill_read(blockstore_operation *read_op, uint32_t item_start,
        }
        while (fulfill_near != read_op->read_vec.end() && fulfill_near->first < item_end)
        {
-            if (!fulfill_read_push(read_op, item_start, item_state, item_version, item_location, cur_start, fulfill_near->first))
+            if (!fulfill_read_push(read_op, fulfilled, item_start, item_state, item_version, item_location, cur_start, fulfill_near->first))
            {
                return 0;
            }
            cur_start = fulfill_near->first + fulfill_near->second.iov_len;
            fulfill_near++;
        }
-        if (!fulfill_read_push(read_op, item_start, item_state, item_version, item_location, cur_start, item_end))
+        if (!fulfill_read_push(read_op, fulfilled, item_start, item_state, item_version, item_location, cur_start, item_end))
        {
            return 0;
        }
@ -87,16 +88,22 @@ int blockstore::dequeue_read(blockstore_operation *read_op)
        read_op->callback(read_op);
        return 1;
    }
    // FIXME track fulfilled and stop when it is equal to read_op->len
    uint64_t fulfilled = 0;
    if (dirty_found)
    {
        while (dirty_it->first.oid == read_op->oid)
        {
            dirty_entry& dirty = dirty_it->second;
-            if (IS_STABLE(dirty.state) || read_op->version >= dirty_it->first.version)
+            bool version_ok = read_op->version >= dirty_it->first.version;
            if (IS_STABLE(dirty.state))
            {
-                if (!fulfill_read(read_op, dirty.offset, dirty.offset + dirty.size,
+                if (!version_ok && read_op->version != 0)
                    read_op->version = dirty_it->first.version;
                version_ok = true;
            }
            if (version_ok)
            {
                if (!fulfill_read(read_op, fulfilled, dirty.offset, dirty.offset + dirty.size,
                    dirty.state, dirty_it->first.version, dirty.location))
                {
                    // need to wait. undo added requests, don't dequeue op
@ -104,12 +111,16 @@ int blockstore::dequeue_read(blockstore_operation *read_op)
                    return 0;
                }
            }
            if (fulfilled == read_op->len)
            {
                break;
            }
            dirty_it--;
        }
    }
    if (clean_it != clean_db.end())
    {
-        if (!fulfill_read(read_op, 0, block_size, ST_CURRENT, 0, clean_it->second.location))
+        if (!fulfill_read(read_op, fulfilled, 0, block_size, ST_CURRENT, 0, clean_it->second.location))
        {
            // need to wait. undo added requests, don't dequeue op
            read_op->read_vec.clear();
--- a/blockstore_stable.cpp
+++ b/blockstore_stable.cpp
@ -1,5 +1,29 @@
 #include "blockstore.h"
 // Stabilize small write:
 // 1) Copy data from the journal to the data device
 //    Sync it before writing metadata if we want to keep metadata consistent
 //    Overall it's optional because it can be replayed from the journal until
 //    it's cleared, and reads are also fulfilled from the journal
 // 2) Increase version on the metadata device and sync it
 // 3) Advance clean_db entry's version, clear previous journal entries
 //
 // This makes 1 4K small write+sync look like:
 // 512b+4K (journal) + sync + 512b (journal) + sync + 4K (data) [+ sync?] + 512b (metadata) + sync.
 // WA = 2.375. It's not the best: an SSD FTL-like redirect-write scheme could
 // probably achieve a lower WA even with defragmentation. But it's fixed and it's still
 // better than in Ceph. :)
 // Stabilize big write:
 // 1) Copy metadata from the journal to the metadata device
 // 2) Move dirty_db entry to clean_db and clear previous journal entries
 //
 // This makes 1 128K big write+sync look like:
 // 128K (data) + sync + 512b (journal) + sync + 512b (journal) + sync + 512b (metadata) + sync.
 // WA = 1.012. Very good :)
 // Also, we must do it in batches to reduce the number of fsync calls
 int blockstore::dequeue_stable(blockstore_operation *op)
 {
    auto dirty_it = dirty_db.find((obj_ver_id){
@ -61,6 +85,11 @@ int blockstore::dequeue_stable(blockstore_operation *op)
    return 1;
 }
 // Stub: performs no work on `op` and unconditionally returns 0.
 // NOTE(review): presumably a placeholder for resuming a multi-step stabilize
 // operation; the meaning of the 0 return value for callers is not visible here - confirm.
 int blockstore::continue_stable(blockstore_operation *op)
 {
    return 0;
 }
 void blockstore::handle_stable_event(ring_data_t *data, blockstore_operation *op)
 {
    if (data->res < 0)
@ -72,21 +101,13 @@ void blockstore::handle_stable_event(ring_data_t *data, blockstore_operation *op
    op->pending_ops--;
    if (op->pending_ops == 0)
    {
-        // Mark dirty_db entry as stable
+        // Mark all dirty_db entries up to op->version as stable
        auto dirty_it = dirty_db.find((obj_ver_id){
            .oid = op->oid,
            .version = op->version,
        });
        if (dirty_it->second.state == ST_J_SYNCED)
        {
            // 1) Copy data from the journal to the data device
            // 2) Increase version on the metadata device
            // 3) Advance clean_db entry's version, clear previous journal entries
            // This makes 1 4K small write+sync look like:
            // 512b+4K (journal) + sync + 512b (journal) + sync + 512b (metadata) + 4K (data) + sync.
            // WA = 2.375. It's not the best, SSD FTL-like redirect-write with defragmentation
            // could probably be lower even with defragmentation. But it's fixed and it's still
            // better than in Ceph. :)
            dirty_it->second.state = ST_J_STABLE;
            // Acknowledge op
            op->retval = 0;
@ -94,15 +115,18 @@ void blockstore::handle_stable_event(ring_data_t *data, blockstore_operation *op
        }
        else if (dirty_it->second.state == ST_D_META_SYNCED)
        {
            // 1) Copy metadata from the journal to the metadata device
            // 2) Move dirty_db entry to clean_db and clear previous journal entries
            // This makes 1 128K big write+sync look like:
            // 128K (data) + sync + 512b (journal) + sync + 512b (journal) + sync + 512b (metadata) + sync.
            // WA = 1.012. Very good :)
            dirty_it->second.state = ST_D_STABLE;
            // Acknowledge op
            op->retval = 0;
            op->callback(op);
        }
        else if (dirty_it->second.state == ST_J_STABLE)
        {
        }
        else if (dirty_it->second.state == ST_D_STABLE)
        {
        }
    }
 }
--- a/blockstore_write.cpp
+++ b/blockstore_write.cpp
@ -66,6 +66,11 @@ int blockstore::dequeue_write(blockstore_operation *op)
        );
        op->pending_ops = 1;
        op->min_used_journal_sector = op->max_used_journal_sector = 0;
        // Remember write as unsynced
        unsynced_big_writes.push_back((obj_ver_id){
            .oid = op->oid,
            .version = op->version,
        });
    }
    else
    {
@ -111,6 +116,10 @@ int blockstore::dequeue_write(blockstore_operation *op)
        dirty_it->second.state = ST_J_SUBMITTED;
        journal.next_free += op->len;
        op->pending_ops = 2;
        unsynced_small_writes.push_back((obj_ver_id){
            .oid = op->oid,
            .version = op->version,
        });
    }
    return 1;
 }
@ -155,21 +164,5 @@ void blockstore::handle_write_event(ring_data_t *data, blockstore_operation *op)
        // Acknowledge write without sync
        op->retval = op->len;
        op->callback(op);
        // Remember write as unsynced
        // FIXME: Could state change to ST_STABLE? It could break this check
        if (IS_BIG_WRITE(dirty_entry.state))
        {
            unsynced_big_writes.push_back((obj_ver_id){
                .oid = op->oid,
                .version = op->version,
            });
        }
        else
        {
            unsynced_small_writes.push_back((obj_ver_id){
                .oid = op->oid,
                .version = op->version,
            });
        }
    }
 }