Autosync based on number of unstable ops to prevent journal stalls

2021-10-30 14:26:48 +03:00 · 2021-10-30 14:26:48 +03:00 · cfe8de9b84
parent 24b9b19066
commit cfe8de9b84
9 changed files with 36 additions and 3 deletions
--- a/mon/mon.js
+++ b/mon/mon.js
@ -87,6 +87,7 @@ const etcd_tree = {
            bind_address: "0.0.0.0",
            bind_port: 0,
            autosync_interval: 5,
+            autosync_writes: 128,
            client_queue_depth: 128, // unused
            recovery_queue_depth: 4,
            recovery_sync_batch: 16,
--- a/src/blockstore.cpp
+++ b/src/blockstore.cpp
@ -68,6 +68,11 @@ uint64_t blockstore_t::get_free_block_count()
    return impl->get_free_block_count();
 }

+uint64_t blockstore_t::get_journal_size() // Public accessor for the journal size; holds no logic of its own
+{
+    return impl->get_journal_size(); // Delegates to the implementation object behind `impl`
+}
+
 uint32_t blockstore_t::get_bitmap_granularity()
 {
    return impl->get_bitmap_granularity();
--- a/src/blockstore.h
+++ b/src/blockstore.h
@ -194,5 +194,7 @@ public:
    uint64_t get_block_count();
    uint64_t get_free_block_count();

+    uint64_t get_journal_size();
+
    uint32_t get_bitmap_granularity();
 };
--- a/src/blockstore_impl.h
+++ b/src/blockstore_impl.h
@ -368,4 +368,5 @@ public:
    inline uint64_t get_block_count() { return block_count; }
    inline uint64_t get_free_block_count() { return data_alloc->get_free_count(); }
    inline uint32_t get_bitmap_granularity() { return disk_alignment; }
+    inline uint64_t get_journal_size() { return journal.len; } // Reports journal.len as the journal size (units not visible here - presumably bytes, TODO confirm)
 };
--- a/src/osd.cpp
+++ b/src/osd.cpp
@ -45,6 +45,12 @@ osd_t::osd_t(const json11::Json & config, ring_loop_t *ringloop)
    // FIXME: Create Blockstore from on-disk superblock config and check it against the OSD cluster config
    auto bs_cfg = json_to_bs(this->config);
    this->bs = new blockstore_t(bs_cfg, ringloop, tfd);
+    {
+        // Autosync based on the number of unstable writes to prevent stalls due to insufficient journal space
+        uint64_t max_autosync = bs->get_journal_size() / bs->get_block_size() / 2;
+        if (autosync_writes > max_autosync)
+            autosync_writes = max_autosync;
+    }

    this->tfd->set_timer(print_stats_interval*1000, true, [this](int timer_id)
    {
@ -123,6 +129,11 @@ void osd_t::parse_config(const json11::Json & config)
        if (autosync_interval > MAX_AUTOSYNC_INTERVAL)
            autosync_interval = DEFAULT_AUTOSYNC_INTERVAL;
    }
+    if (!config["autosync_writes"].is_null())
+    {
+        // Allow setting it to 0
+        autosync_writes = config["autosync_writes"].uint64_value();
+    }
    if (!config["client_queue_depth"].is_null())
    {
        client_queue_depth = config["client_queue_depth"].uint64_value();
--- a/src/osd.h
+++ b/src/osd.h
@ -35,6 +35,7 @@

 #define MAX_AUTOSYNC_INTERVAL 3600
 #define DEFAULT_AUTOSYNC_INTERVAL 5
+#define DEFAULT_AUTOSYNC_WRITES 128
 #define MAX_RECOVERY_QUEUE 2048
 #define DEFAULT_RECOVERY_QUEUE 4
 #define DEFAULT_RECOVERY_BATCH 16
@ -108,7 +109,8 @@ class osd_t
    int print_stats_interval = 3;
    int slow_log_interval = 10;
    int immediate_commit = IMMEDIATE_NONE;
-    int autosync_interval = DEFAULT_AUTOSYNC_INTERVAL; // sync every 5 seconds
+    int autosync_interval = DEFAULT_AUTOSYNC_INTERVAL; // "emergency" sync every 5 seconds
+    int autosync_writes = DEFAULT_AUTOSYNC_WRITES;
    int recovery_queue_depth = DEFAULT_RECOVERY_QUEUE;
    int recovery_sync_batch = DEFAULT_RECOVERY_BATCH;
    int log_level = 0;
@ -140,6 +142,7 @@ class osd_t
    osd_op_t *autosync_op = NULL;

    // Unstable writes
+    uint64_t unstable_write_count = 0;
    std::map<osd_object_id_t, uint64_t> unstable_writes;
    std::deque<osd_op_t*> syncs_in_progress;

--- a/src/osd_primary_subops.cpp
+++ b/src/osd_primary_subops.cpp
@ -5,8 +5,6 @@

 void osd_t::autosync()
 {
-    // FIXME Autosync based on the number of unstable writes to prevent
-    // "journal_sector_buffer_count is too low for this batch" errors
    if (immediate_commit != IMMEDIATE_ALL && !autosync_op)
    {
        autosync_op = new osd_op_t();
--- a/src/osd_primary_write.cpp
+++ b/src/osd_primary_write.cpp
@ -274,6 +274,11 @@ continue_others:
    }
    // finish_op would invalidate next_it if it cleared pg.write_queue, but it doesn't do that :)
    finish_op(cur_op, cur_op->reply.hdr.retval);
+    if (unstable_write_count >= autosync_writes)
+    {
+        unstable_write_count = 0;
+        autosync();
+    }
    if (next_op)
    {
        // Continue next write to the same object
@ -353,6 +358,7 @@ resume_7:
    else
    {
 lazy:
+        unstable_write_count++;
        if (op_data->scheme != POOL_SCHEME_REPLICATED)
        {
            // Remember version as unstable for EC/XOR
--- a/tests/test_write.sh
+++ b/tests/test_write.sh
@ -5,6 +5,12 @@
 #LD_PRELOAD=libasan.so.5 \
 #    fio -thread -name=test -ioengine=build/src/libfio_vitastor_sec.so -bs=4k -fsync=128 `$ETCDCTL get /vitastor/osd/state/1 --print-value-only | jq -r '"-host="+.addresses[0]+" -port="+(.port|tostring)'` -rw=write -size=32M

+# Random writes without immediate_commit were stalling OSDs
+
+LD_PRELOAD=libasan.so.5 \
+    fio -thread -name=test -ioengine=build/src/libfio_vitastor.so -bs=124k -direct=1 -numjobs=16 -iodepth=4 \
+        -rw=randwrite -etcd=$ETCD_URL -pool=1 -inode=1 -size=128M -runtime=10
+
 # A lot of parallel syncs was crashing the primary OSD at some point

 LD_PRELOAD=libasan.so.5 \