Browse Source

Comments about stabilize operation, track unsynced_writes correctly

blocking-uring-test
Vitaliy Filippov 2 years ago
parent
commit
d2d8d6e7fb
  1. 5
      blockstore.h
  2. 27
      blockstore_read.cpp
  3. 52
      blockstore_stable.cpp
  4. 25
      blockstore_write.cpp

5
blockstore.h

@ -275,9 +275,9 @@ class blockstore
// Read
int dequeue_read(blockstore_operation *read_op);
int fulfill_read(blockstore_operation *read_op, uint32_t item_start, uint32_t item_end,
int fulfill_read(blockstore_operation *read_op, uint64_t &fulfilled, uint32_t item_start, uint32_t item_end,
uint32_t item_state, uint64_t item_version, uint64_t item_location);
int fulfill_read_push(blockstore_operation *read_op, uint32_t item_start,
int fulfill_read_push(blockstore_operation *read_op, uint64_t &fulfilled, uint32_t item_start,
uint32_t item_state, uint64_t item_version, uint64_t item_location, uint32_t cur_start, uint32_t cur_end);
void handle_read_event(ring_data_t *data, blockstore_operation *op);
@ -294,6 +294,7 @@ class blockstore
// Stable
int dequeue_stable(blockstore_operation *op);
int continue_stable(blockstore_operation *op);
void handle_stable_event(ring_data_t *data, blockstore_operation *op);
public:

27
blockstore_read.cpp

@ -1,6 +1,6 @@
#include "blockstore.h"
int blockstore::fulfill_read_push(blockstore_operation *op, uint32_t item_start,
int blockstore::fulfill_read_push(blockstore_operation *op, uint64_t &fulfilled, uint32_t item_start,
uint32_t item_state, uint64_t item_version, uint64_t item_location, uint32_t cur_start, uint32_t cur_end)
{
if (cur_end > cur_start)
@ -31,11 +31,12 @@ int blockstore::fulfill_read_push(blockstore_operation *op, uint32_t item_start,
(IS_JOURNAL(item_state) ? journal.offset : data_offset) + item_location + cur_start - item_start
);
data->op = op;
fulfilled += cur_end-cur_start;
}
return 1;
}
int blockstore::fulfill_read(blockstore_operation *read_op, uint32_t item_start, uint32_t item_end,
int blockstore::fulfill_read(blockstore_operation *read_op, uint64_t &fulfilled, uint32_t item_start, uint32_t item_end,
uint32_t item_state, uint64_t item_version, uint64_t item_location)
{
uint32_t cur_start = item_start;
@ -54,14 +55,14 @@ int blockstore::fulfill_read(blockstore_operation *read_op, uint32_t item_start,
}
while (fulfill_near != read_op->read_vec.end() && fulfill_near->first < item_end)
{
if (!fulfill_read_push(read_op, item_start, item_state, item_version, item_location, cur_start, fulfill_near->first))
if (!fulfill_read_push(read_op, fulfilled, item_start, item_state, item_version, item_location, cur_start, fulfill_near->first))
{
return 0;
}
cur_start = fulfill_near->first + fulfill_near->second.iov_len;
fulfill_near++;
}
if (!fulfill_read_push(read_op, item_start, item_state, item_version, item_location, cur_start, item_end))
if (!fulfill_read_push(read_op, fulfilled, item_start, item_state, item_version, item_location, cur_start, item_end))
{
return 0;
}
@ -87,16 +88,22 @@ int blockstore::dequeue_read(blockstore_operation *read_op)
read_op->callback(read_op);
return 1;
}
// FIXME track fulfilled and stop when it is equal to read_op->len
uint64_t fulfilled = 0;
if (dirty_found)
{
while (dirty_it->first.oid == read_op->oid)
{
dirty_entry& dirty = dirty_it->second;
if (IS_STABLE(dirty.state) || read_op->version >= dirty_it->first.version)
bool version_ok = read_op->version >= dirty_it->first.version;
if (IS_STABLE(dirty.state))
{
if (!fulfill_read(read_op, dirty.offset, dirty.offset + dirty.size,
if (!version_ok && read_op->version != 0)
read_op->version = dirty_it->first.version;
version_ok = true;
}
if (version_ok)
{
if (!fulfill_read(read_op, fulfilled, dirty.offset, dirty.offset + dirty.size,
dirty.state, dirty_it->first.version, dirty.location))
{
// need to wait. undo added requests, don't dequeue op
@ -104,12 +111,16 @@ int blockstore::dequeue_read(blockstore_operation *read_op)
return 0;
}
}
if (fulfilled == read_op->len)
{
break;
}
dirty_it--;
}
}
if (clean_it != clean_db.end())
{
if (!fulfill_read(read_op, 0, block_size, ST_CURRENT, 0, clean_it->second.location))
if (!fulfill_read(read_op, fulfilled, 0, block_size, ST_CURRENT, 0, clean_it->second.location))
{
// need to wait. undo added requests, don't dequeue op
read_op->read_vec.clear();

52
blockstore_stable.cpp

@ -1,5 +1,29 @@
#include "blockstore.h"
// Stabilize small write:
// 1) Copy data from the journal to the data device
// Sync it before writing metadata if we want to keep metadata consistent
// Overall it's optional because it can be replayed from the journal until
// it's cleared, and reads are also fulfilled from the journal
// 2) Increase version on the metadata device and sync it
// 3) Advance clean_db entry's version, clear previous journal entries
//
// This makes 1 4K small write+sync look like:
// 512b+4K (journal) + sync + 512b (journal) + sync + 4K (data) [+ sync?] + 512b (metadata) + sync.
// WA = 2.375. It's not the best, SSD FTL-like redirect-write with defragmentation
// could probably be lower even with defragmentation. But it's fixed and it's still
// better than in Ceph. :)
// Stabilize big write:
// 1) Copy metadata from the journal to the metadata device
// 2) Move dirty_db entry to clean_db and clear previous journal entries
//
// This makes 1 128K big write+sync look like:
// 128K (data) + sync + 512b (journal) + sync + 512b (journal) + sync + 512b (metadata) + sync.
// WA = 1.012. Very good :)
// AND We must do it in batches, for the sake of reduced fsync call count
int blockstore::dequeue_stable(blockstore_operation *op)
{
auto dirty_it = dirty_db.find((obj_ver_id){
@ -61,6 +85,11 @@ int blockstore::dequeue_stable(blockstore_operation *op)
return 1;
}
int blockstore::continue_stable(blockstore_operation *op)
{
    // Placeholder added alongside the stabilize rework: resuming a previously
    // deferred stabilize operation is not implemented yet, so this always
    // reports "nothing to continue" (0). NOTE(review): presumably dequeue_stable
    // will later park ops that this function resumes — confirm once the
    // full blockstore_stable.cpp is in view.
    return 0;
}
void blockstore::handle_stable_event(ring_data_t *data, blockstore_operation *op)
{
if (data->res < 0)
@ -72,21 +101,13 @@ void blockstore::handle_stable_event(ring_data_t *data, blockstore_operation *op
op->pending_ops--;
if (op->pending_ops == 0)
{
// Mark dirty_db entry as stable
// Mark all dirty_db entries up to op->version as stable
auto dirty_it = dirty_db.find((obj_ver_id){
.oid = op->oid,
.version = op->version,
});
if (dirty_it->second.state == ST_J_SYNCED)
{
// 1) Copy data from the journal to the data device
// 2) Increase version on the metadata device
// 3) Advance clean_db entry's version, clear previous journal entries
// This makes 1 4K small write+sync look like:
// 512b+4K (journal) + sync + 512b (journal) + sync + 512b (metadata) + 4K (data) + sync.
// WA = 2.375. It's not the best, SSD FTL-like redirect-write with defragmentation
// could probably be lower even with defragmentation. But it's fixed and it's still
// better than in Ceph. :)
dirty_it->second.state = ST_J_STABLE;
// Acknowledge op
op->retval = 0;
@ -94,15 +115,18 @@ void blockstore::handle_stable_event(ring_data_t *data, blockstore_operation *op
}
else if (dirty_it->second.state == ST_D_META_SYNCED)
{
// 1) Copy metadata from the journal to the metadata device
// 2) Move dirty_db entry to clean_db and clear previous journal entries
// This makes 1 128K big write+sync look like:
// 128K (data) + sync + 512b (journal) + sync + 512b (journal) + sync + 512b (metadata) + sync.
// WA = 1.012. Very good :)
dirty_it->second.state = ST_D_STABLE;
// Acknowledge op
op->retval = 0;
op->callback(op);
}
else if (dirty_it->second.state == ST_J_STABLE)
{
}
else if (dirty_it->second.state == ST_D_STABLE)
{
}
}
}

25
blockstore_write.cpp

@ -66,6 +66,11 @@ int blockstore::dequeue_write(blockstore_operation *op)
);
op->pending_ops = 1;
op->min_used_journal_sector = op->max_used_journal_sector = 0;
// Remember write as unsynced
unsynced_big_writes.push_back((obj_ver_id){
.oid = op->oid,
.version = op->version,
});
}
else
{
@ -111,6 +116,10 @@ int blockstore::dequeue_write(blockstore_operation *op)
dirty_it->second.state = ST_J_SUBMITTED;
journal.next_free += op->len;
op->pending_ops = 2;
unsynced_small_writes.push_back((obj_ver_id){
.oid = op->oid,
.version = op->version,
});
}
return 1;
}
@ -155,21 +164,5 @@ void blockstore::handle_write_event(ring_data_t *data, blockstore_operation *op)
// Acknowledge write without sync
op->retval = op->len;
op->callback(op);
// Remember write as unsynced
// FIXME: Could state change to ST_STABLE? It could break this check
if (IS_BIG_WRITE(dirty_entry.state))
{
unsynced_big_writes.push_back((obj_ver_id){
.oid = op->oid,
.version = op->version,
});
}
else
{
unsynced_small_writes.push_back((obj_ver_id){
.oid = op->oid,
.version = op->version,
});
}
}
}
Loading…
Cancel
Save