From d2d8d6e7fb791e179773217f1e53d9af47aacc0e Mon Sep 17 00:00:00 2001
From: Vitaliy Filippov
Date: Mon, 11 Nov 2019 02:53:19 +0300
Subject: [PATCH] Comments about stabilize operation, track unsynced_writes correctly

---
 blockstore.h          |  5 +++--
 blockstore_read.cpp   | 27 +++++++++++++++-------
 blockstore_stable.cpp | 52 +++++++++++++++++++++++++++++++------------
 blockstore_write.cpp  | 25 ++++++++-------------
 4 files changed, 69 insertions(+), 40 deletions(-)

diff --git a/blockstore.h b/blockstore.h
index 25706cf5..8ac61235 100644
--- a/blockstore.h
+++ b/blockstore.h
@@ -275,9 +275,9 @@ class blockstore
 
     // Read
     int dequeue_read(blockstore_operation *read_op);
-    int fulfill_read(blockstore_operation *read_op, uint32_t item_start, uint32_t item_end,
+    int fulfill_read(blockstore_operation *read_op, uint64_t &fulfilled, uint32_t item_start, uint32_t item_end,
         uint32_t item_state, uint64_t item_version, uint64_t item_location);
-    int fulfill_read_push(blockstore_operation *read_op, uint32_t item_start,
+    int fulfill_read_push(blockstore_operation *read_op, uint64_t &fulfilled, uint32_t item_start,
         uint32_t item_state, uint64_t item_version, uint64_t item_location, uint32_t cur_start, uint32_t cur_end);
     void handle_read_event(ring_data_t *data, blockstore_operation *op);
 
@@ -294,6 +294,7 @@ class blockstore
 
     // Stable
     int dequeue_stable(blockstore_operation *op);
+    int continue_stable(blockstore_operation *op);
     void handle_stable_event(ring_data_t *data, blockstore_operation *op);
 
 public:
diff --git a/blockstore_read.cpp b/blockstore_read.cpp
index c7b975d1..d62c0703 100644
--- a/blockstore_read.cpp
+++ b/blockstore_read.cpp
@@ -1,6 +1,6 @@
 #include "blockstore.h"
 
-int blockstore::fulfill_read_push(blockstore_operation *op, uint32_t item_start,
+int blockstore::fulfill_read_push(blockstore_operation *op, uint64_t &fulfilled, uint32_t item_start,
     uint32_t item_state, uint64_t item_version, uint64_t item_location, uint32_t cur_start, uint32_t cur_end)
 {
     if (cur_end > cur_start)
@@ -31,11 +31,12 @@ int blockstore::fulfill_read_push(blockstore_operation *op, uint32_t item_start,
             (IS_JOURNAL(item_state) ? journal.offset : data_offset) + item_location + cur_start - item_start
         );
         data->op = op;
+        fulfilled += cur_end-cur_start;
     }
     return 1;
 }
 
-int blockstore::fulfill_read(blockstore_operation *read_op, uint32_t item_start, uint32_t item_end,
+int blockstore::fulfill_read(blockstore_operation *read_op, uint64_t &fulfilled, uint32_t item_start, uint32_t item_end,
     uint32_t item_state, uint64_t item_version, uint64_t item_location)
 {
     uint32_t cur_start = item_start;
@@ -54,14 +55,14 @@ int blockstore::fulfill_read(blockstore_operation *read_op, uint32_t item_start,
         }
         while (fulfill_near != read_op->read_vec.end() && fulfill_near->first < item_end)
         {
-            if (!fulfill_read_push(read_op, item_start, item_state, item_version, item_location, cur_start, fulfill_near->first))
+            if (!fulfill_read_push(read_op, fulfilled, item_start, item_state, item_version, item_location, cur_start, fulfill_near->first))
             {
                 return 0;
             }
             cur_start = fulfill_near->first + fulfill_near->second.iov_len;
             fulfill_near++;
         }
-        if (!fulfill_read_push(read_op, item_start, item_state, item_version, item_location, cur_start, item_end))
+        if (!fulfill_read_push(read_op, fulfilled, item_start, item_state, item_version, item_location, cur_start, item_end))
         {
             return 0;
         }
@@ -87,16 +88,22 @@ int blockstore::dequeue_read(blockstore_operation *read_op)
         read_op->callback(read_op);
         return 1;
     }
-    // FIXME track fulfilled and stop when it is equal to read_op->len
     uint64_t fulfilled = 0;
     if (dirty_found)
     {
         while (dirty_it->first.oid == read_op->oid)
         {
             dirty_entry& dirty = dirty_it->second;
-            if (IS_STABLE(dirty.state) || read_op->version >= dirty_it->first.version)
+            bool version_ok = read_op->version >= dirty_it->first.version;
+            if (IS_STABLE(dirty.state))
             {
-                if (!fulfill_read(read_op, dirty.offset, dirty.offset + dirty.size,
+                if (!version_ok && read_op->version != 0)
+                    read_op->version = dirty_it->first.version;
+                version_ok = true;
+            }
+            if (version_ok)
+            {
+                if (!fulfill_read(read_op, fulfilled, dirty.offset, dirty.offset + dirty.size,
                     dirty.state, dirty_it->first.version, dirty.location))
                 {
                     // need to wait. undo added requests, don't dequeue op
@@ -104,12 +111,16 @@ int blockstore::dequeue_read(blockstore_operation *read_op)
                     return 0;
                 }
             }
+            if (fulfilled == read_op->len)
+            {
+                break;
+            }
             dirty_it--;
         }
     }
     if (clean_it != clean_db.end())
     {
-        if (!fulfill_read(read_op, 0, block_size, ST_CURRENT, 0, clean_it->second.location))
+        if (!fulfill_read(read_op, fulfilled, 0, block_size, ST_CURRENT, 0, clean_it->second.location))
        {
             // need to wait. undo added requests, don't dequeue op
             read_op->read_vec.clear();
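[Editor's note: the read changes above thread a "fulfilled" byte counter by
reference through fulfill_read()/fulfill_read_push(), so that dequeue_read()
can stop walking older dirty versions as soon as the whole request is covered
(resolving the removed FIXME). A minimal standalone sketch of that pattern,
separate from the patch; segment and fulfill_segment() are illustrative names,
not the blockstore API:

    #include <cinttypes>
    #include <cstdint>
    #include <cstdio>
    #include <vector>

    struct segment { uint32_t start, end; };

    // Analogue of fulfill_read_push(): account for one copied segment
    static void fulfill_segment(uint64_t &fulfilled, segment s)
    {
        fulfilled += s.end - s.start; // same accounting as fulfilled += cur_end-cur_start
    }

    int main()
    {
        // Non-overlapping pieces of an object, newest version first
        std::vector<segment> pieces = { { 0, 4096 }, { 4096, 8192 }, { 8192, 131072 } };
        uint64_t read_len = 8192, fulfilled = 0;
        for (segment s : pieces)
        {
            fulfill_segment(fulfilled, s);
            if (fulfilled == read_len)
                break; // same early exit as in dequeue_read()
        }
        printf("fulfilled %" PRIu64 " of %" PRIu64 " bytes\n", fulfilled, read_len);
        return 0;
    }
]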
diff --git a/blockstore_stable.cpp b/blockstore_stable.cpp
index a27aa218..ea2594be 100644
--- a/blockstore_stable.cpp
+++ b/blockstore_stable.cpp
@@ -1,5 +1,29 @@
 #include "blockstore.h"
 
+// Stabilize small write:
+// 1) Copy data from the journal to the data device.
+//    Sync it before writing metadata if we want to keep metadata consistent.
+//    Overall it's optional, because the write can be replayed from the journal
+//    until the journal is cleared, and reads are also fulfilled from the journal.
+// 2) Increase the version on the metadata device and sync it.
+// 3) Advance the clean_db entry's version, clear previous journal entries.
+//
+// This makes 1 4K small write+sync look like:
+// 512b+4K (journal) + sync + 512b (journal) + sync + 4K (data) [+ sync?] + 512b (metadata) + sync.
+// WA = 2.375. It's not the best; SSD FTL-like redirect-write could probably be
+// lower, even with defragmentation. But it's fixed, and it's still better than
+// in Ceph. :)
+
+// Stabilize big write:
+// 1) Copy metadata from the journal to the metadata device.
+// 2) Move the dirty_db entry to clean_db and clear previous journal entries.
+//
+// This makes 1 128K big write+sync look like:
+// 128K (data) + sync + 512b (journal) + sync + 512b (journal) + sync + 512b (metadata) + sync.
+// WA = 1.012. Very good :)
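+//
+// (Editor's note, a worked check of the numbers above, assuming 512-byte
+// journal and metadata writes: the small-write path writes
+// 0.5K+4K + 0.5K + 4K + 0.5K = 9.5K per 4K of user data, so WA = 9.5/4 = 2.375;
+// the big-write path writes 128K + 0.5K + 0.5K + 0.5K = 129.5K per 128K,
+// so WA = 129.5/128 ≈ 1.012.)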
+
+// And we must do it in batches, for the sake of a reduced fsync call count.
+
 int blockstore::dequeue_stable(blockstore_operation *op)
 {
     auto dirty_it = dirty_db.find((obj_ver_id){
@@ -61,6 +85,11 @@ int blockstore::dequeue_stable(blockstore_operation *op)
     return 1;
 }
 
+int blockstore::continue_stable(blockstore_operation *op)
+{
+    return 0;
+}
+
 void blockstore::handle_stable_event(ring_data_t *data, blockstore_operation *op)
 {
     if (data->res < 0)
@@ -72,21 +101,13 @@ void blockstore::handle_stable_event(ring_data_t *data, blockstore_operation *op
     op->pending_ops--;
     if (op->pending_ops == 0)
     {
-        // Mark dirty_db entry as stable
+        // Mark all dirty_db entries up to op->version as stable
         auto dirty_it = dirty_db.find((obj_ver_id){
             .oid = op->oid,
             .version = op->version,
         });
         if (dirty_it->second.state == ST_J_SYNCED)
         {
-            // 1) Copy data from the journal to the data device
-            // 2) Increase version on the metadata device
-            // 3) Advance clean_db entry's version, clear previous journal entries
-            // This makes 1 4K small write+sync look like:
-            // 512b+4K (journal) + sync + 512b (journal) + sync + 512b (metadata) + 4K (data) + sync.
-            // WA = 2.375. It's not the best, SSD FTL-like redirect-write with defragmentation
-            // could probably be lower even with defragmentation. But it's fixed and it's still
-            // better than in Ceph. :)
             dirty_it->second.state = ST_J_STABLE;
             // Acknowledge op
             op->retval = 0;
@@ -94,15 +115,18 @@ void blockstore::handle_stable_event(ring_data_t *data, blockstore_operation *op
         }
         else if (dirty_it->second.state == ST_D_META_SYNCED)
         {
-            // 1) Copy metadata from the journal to the metadata device
-            // 2) Move dirty_db entry to clean_db and clear previous journal entries
-            // This makes 1 128K big write+sync look like:
-            // 128K (data) + sync + 512b (journal) + sync + 512b (journal) + sync + 512b (metadata) + sync.
-            // WA = 1.012. Very good :)
             dirty_it->second.state = ST_D_STABLE;
             // Acknowledge op
             op->retval = 0;
             op->callback(op);
         }
+        else if (dirty_it->second.state == ST_J_STABLE)
+        {
+
+        }
+        else if (dirty_it->second.state == ST_D_STABLE)
+        {
+
+        }
     }
 }
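[Editor's note: continue_stable() above is only a stub so far. To illustrate
the batching idea from the new header comment ("we must do it in batches, for
the sake of a reduced fsync call count"), here is a minimal hypothetical
sketch, separate from the patch; obj_ver, stabilize_batch() and
sync_metadata() are illustrative names, not the blockstore API:

    #include <cinttypes>
    #include <cstdint>
    #include <cstdio>
    #include <vector>

    struct obj_ver { uint64_t oid, version; };

    // Stand-in for the real metadata fsync
    static void sync_metadata() { printf("fsync\n"); }

    // Stabilize a whole batch of versions, then sync once: N stabilize
    // operations cost 1 fsync instead of N
    static void stabilize_batch(std::vector<obj_ver> &batch)
    {
        for (const obj_ver &v : batch)
            printf("stabilize oid=%" PRIu64 " ver=%" PRIu64 "\n", v.oid, v.version);
        sync_metadata();
        batch.clear();
    }

    int main()
    {
        std::vector<obj_ver> batch = { { 1, 5 }, { 2, 3 }, { 7, 1 } };
        stabilize_batch(batch); // 3 stabilizes, 1 fsync
        return 0;
    }
]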
diff --git a/blockstore_write.cpp b/blockstore_write.cpp
index 703ff8ec..9b08c8c9 100644
--- a/blockstore_write.cpp
+++ b/blockstore_write.cpp
@@ -66,6 +66,11 @@ int blockstore::dequeue_write(blockstore_operation *op)
         );
         op->pending_ops = 1;
         op->min_used_journal_sector = op->max_used_journal_sector = 0;
+        // Remember write as unsynced
+        unsynced_big_writes.push_back((obj_ver_id){
+            .oid = op->oid,
+            .version = op->version,
+        });
     }
     else
     {
@@ -111,6 +116,10 @@ int blockstore::dequeue_write(blockstore_operation *op)
         dirty_it->second.state = ST_J_SUBMITTED;
         journal.next_free += op->len;
         op->pending_ops = 2;
+        unsynced_small_writes.push_back((obj_ver_id){
+            .oid = op->oid,
+            .version = op->version,
+        });
     }
     return 1;
 }
@@ -155,21 +164,5 @@ void blockstore::handle_write_event(ring_data_t *data, blockstore_operation *op)
         // Acknowledge write without sync
         op->retval = op->len;
         op->callback(op);
-        // Remember write as unsynced
-        // FIXME: Could state change to ST_STABLE? It could break this check
-        if (IS_BIG_WRITE(dirty_entry.state))
-        {
-            unsynced_big_writes.push_back((obj_ver_id){
-                .oid = op->oid,
-                .version = op->version,
-            });
-        }
-        else
-        {
-            unsynced_small_writes.push_back((obj_ver_id){
-                .oid = op->oid,
-                .version = op->version,
-            });
-        }
     }
 }
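[Editor's note: the last hunk moves the unsynced-write bookkeeping from the
completion handler into dequeue_write(), resolving the old FIXME: at
submission time the kind of the write (big or small) is known for certain,
while by completion time the entry's state could in principle have changed
and confused the IS_BIG_WRITE() check. A minimal standalone sketch of the
same idea, with illustrative names (submit_write() is not the blockstore
API):

    #include <cstdint>
    #include <cstdio>
    #include <vector>

    struct obj_ver { uint64_t oid, version; };

    static std::vector<obj_ver> unsynced_big_writes, unsynced_small_writes;

    // Record the (oid, version) pair in the right list at submission time,
    // when the write kind is still unambiguous
    static void submit_write(uint64_t oid, uint64_t version, bool is_big)
    {
        (is_big ? unsynced_big_writes : unsynced_small_writes)
            .push_back({ oid, version });
    }

    int main()
    {
        submit_write(1, 1, true);  // big write -> unsynced_big_writes
        submit_write(1, 2, false); // small write -> unsynced_small_writes
        printf("big: %zu, small: %zu\n",
            unsynced_big_writes.size(), unsynced_small_writes.size());
        return 0;
    }
]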