Implement journaled write throttling for the SSD+HDD case

rdma-zerocopy
Vitaliy Filippov 2021-03-28 22:47:35 +03:00
parent 4694811eab
commit 2ab423d4ef
9 changed files with 172 additions and 85 deletions

View File

@ -3,9 +3,9 @@
#include "blockstore_impl.h" #include "blockstore_impl.h"
blockstore_t::blockstore_t(blockstore_config_t & config, ring_loop_t *ringloop) blockstore_t::blockstore_t(blockstore_config_t & config, ring_loop_t *ringloop, timerfd_manager_t *tfd)
{ {
impl = new blockstore_impl_t(config, ringloop); impl = new blockstore_impl_t(config, ringloop, tfd);
} }
blockstore_t::~blockstore_t() blockstore_t::~blockstore_t()

View File

@ -16,6 +16,7 @@
#include "object_id.h" #include "object_id.h"
#include "ringloop.h" #include "ringloop.h"
#include "timerfd_manager.h"
// Memory alignment for direct I/O (usually 512 bytes) // Memory alignment for direct I/O (usually 512 bytes)
// All other alignments must be a multiple of this one // All other alignments must be a multiple of this one
@ -158,7 +159,7 @@ class blockstore_t
{ {
blockstore_impl_t *impl; blockstore_impl_t *impl;
public: public:
blockstore_t(blockstore_config_t & config, ring_loop_t *ringloop); blockstore_t(blockstore_config_t & config, ring_loop_t *ringloop, timerfd_manager_t *tfd);
~blockstore_t(); ~blockstore_t();
// Event loop // Event loop

View File

@ -3,9 +3,10 @@
#include "blockstore_impl.h" #include "blockstore_impl.h"
blockstore_impl_t::blockstore_impl_t(blockstore_config_t & config, ring_loop_t *ringloop) blockstore_impl_t::blockstore_impl_t(blockstore_config_t & config, ring_loop_t *ringloop, timerfd_manager_t *tfd)
{ {
assert(sizeof(blockstore_op_private_t) <= BS_OP_PRIVATE_DATA_SIZE); assert(sizeof(blockstore_op_private_t) <= BS_OP_PRIVATE_DATA_SIZE);
this->tfd = tfd;
this->ringloop = ringloop; this->ringloop = ringloop;
ring_consumer.loop = [this]() { loop(); }; ring_consumer.loop = [this]() { loop(); };
ringloop->register_consumer(&ring_consumer); ringloop->register_consumer(&ring_consumer);

View File

@ -9,6 +9,7 @@
#include <sys/ioctl.h> #include <sys/ioctl.h>
#include <sys/stat.h> #include <sys/stat.h>
#include <fcntl.h> #include <fcntl.h>
#include <time.h>
#include <unistd.h> #include <unistd.h>
#include <linux/fs.h> #include <linux/fs.h>
@ -158,6 +159,7 @@ struct blockstore_op_private_t
struct iovec iov_zerofill[3]; struct iovec iov_zerofill[3];
// Warning: must not have a default value here because it's written to before calling constructor in blockstore_write.cpp O_o // Warning: must not have a default value here because it's written to before calling constructor in blockstore_write.cpp O_o
uint64_t real_version; uint64_t real_version;
timespec tv_begin;
// Sync // Sync
std::vector<obj_ver_id> sync_big_writes, sync_small_writes; std::vector<obj_ver_id> sync_big_writes, sync_small_writes;
@ -203,6 +205,14 @@ class blockstore_impl_t
unsigned max_flusher_count, min_flusher_count; unsigned max_flusher_count, min_flusher_count;
// Maximum queue depth // Maximum queue depth
unsigned max_write_iodepth = 128; unsigned max_write_iodepth = 128;
// Enable small (journaled) write throttling, useful for the SSD+HDD case
bool throttle_small_writes = false;
// Target data device iops, bandwidth and parallelism for throttling (100/100/1 is the default for HDD)
int throttle_target_iops = 100;
int throttle_target_mbs = 100;
int throttle_target_parallelism = 1;
// Minimum difference in microseconds between target and real execution times to throttle the response
int throttle_threshold_us = 50;
/******* END OF OPTIONS *******/ /******* END OF OPTIONS *******/
struct ring_consumer_t ring_consumer; struct ring_consumer_t ring_consumer;
@ -233,6 +243,7 @@ class blockstore_impl_t
bool live = false, queue_stall = false; bool live = false, queue_stall = false;
ring_loop_t *ringloop; ring_loop_t *ringloop;
timerfd_manager_t *tfd;
bool stop_sync_submitted; bool stop_sync_submitted;
@ -303,7 +314,7 @@ class blockstore_impl_t
public: public:
blockstore_impl_t(blockstore_config_t & config, ring_loop_t *ringloop); blockstore_impl_t(blockstore_config_t & config, ring_loop_t *ringloop, timerfd_manager_t *tfd);
~blockstore_impl_t(); ~blockstore_impl_t();
// Event loop // Event loop

View File

@ -79,6 +79,11 @@ void blockstore_impl_t::parse_config(blockstore_config_t & config)
max_flusher_count = strtoull(config["flusher_count"].c_str(), NULL, 10); max_flusher_count = strtoull(config["flusher_count"].c_str(), NULL, 10);
min_flusher_count = strtoull(config["min_flusher_count"].c_str(), NULL, 10); min_flusher_count = strtoull(config["min_flusher_count"].c_str(), NULL, 10);
max_write_iodepth = strtoull(config["max_write_iodepth"].c_str(), NULL, 10); max_write_iodepth = strtoull(config["max_write_iodepth"].c_str(), NULL, 10);
throttle_small_writes = config["throttle_small_writes"] == "true" || config["throttle_small_writes"] == "1" || config["throttle_small_writes"] == "yes";
throttle_target_iops = strtoull(config["throttle_target_iops"].c_str(), NULL, 10);
throttle_target_mbs = strtoull(config["throttle_target_mbs"].c_str(), NULL, 10);
throttle_target_parallelism = strtoull(config["throttle_target_parallelism"].c_str(), NULL, 10);
throttle_threshold_us = strtoull(config["throttle_threshold_us"].c_str(), NULL, 10);
// Validate // Validate
if (!block_size) if (!block_size)
{ {
@ -180,6 +185,22 @@ void blockstore_impl_t::parse_config(blockstore_config_t & config)
{ {
throw std::runtime_error("immediate_commit=all requires disable_journal_fsync and disable_data_fsync"); throw std::runtime_error("immediate_commit=all requires disable_journal_fsync and disable_data_fsync");
} }
if (!throttle_target_iops)
{
throttle_target_iops = 100;
}
if (!throttle_target_mbs)
{
throttle_target_mbs = 100;
}
if (!throttle_target_parallelism)
{
throttle_target_parallelism = 1;
}
if (!throttle_threshold_us)
{
throttle_threshold_us = 50;
}
// init some fields // init some fields
clean_entry_bitmap_size = block_size / bitmap_granularity / 8; clean_entry_bitmap_size = block_size / bitmap_granularity / 8;
clean_entry_size = sizeof(clean_disk_entry) + 2*clean_entry_bitmap_size; clean_entry_size = sizeof(clean_disk_entry) + 2*clean_entry_bitmap_size;

View File

@ -122,6 +122,8 @@ bool blockstore_impl_t::enqueue_write(blockstore_op_t *op)
else else
{ {
state = (op->len == block_size || deleted ? BS_ST_BIG_WRITE : BS_ST_SMALL_WRITE); state = (op->len == block_size || deleted ? BS_ST_BIG_WRITE : BS_ST_SMALL_WRITE);
if (state == BS_ST_SMALL_WRITE && throttle_small_writes)
clock_gettime(CLOCK_REALTIME, &PRIV(op)->tv_begin);
if (wait_del) if (wait_del)
state |= BS_ST_WAIT_DEL; state |= BS_ST_WAIT_DEL;
else if (state == BS_ST_SMALL_WRITE && wait_big) else if (state == BS_ST_SMALL_WRITE && wait_big)
@ -424,105 +426,148 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
int blockstore_impl_t::continue_write(blockstore_op_t *op) int blockstore_impl_t::continue_write(blockstore_op_t *op)
{ {
io_uring_sqe *sqe = NULL;
journal_entry_big_write *je;
int op_state = PRIV(op)->op_state; int op_state = PRIV(op)->op_state;
if (op_state != 2 && op_state != 4)
{
// In progress
return 1;
}
auto dirty_it = dirty_db.find((obj_ver_id){
.oid = op->oid,
.version = op->version,
});
assert(dirty_it != dirty_db.end());
if (op_state == 2) if (op_state == 2)
goto resume_2; goto resume_2;
else if (op_state == 4) else if (op_state == 4)
goto resume_4; goto resume_4;
else if (op_state == 6)
goto resume_6;
else
{
// In progress
return 1;
}
resume_2: resume_2:
// Only for the immediate_commit mode: prepare and submit big_write journal entry // Only for the immediate_commit mode: prepare and submit big_write journal entry
BS_SUBMIT_GET_SQE_DECL(sqe); {
je = (journal_entry_big_write*)prefill_single_journal_entry( auto dirty_it = dirty_db.find((obj_ver_id){
journal, op->opcode == BS_OP_WRITE_STABLE ? JE_BIG_WRITE_INSTANT : JE_BIG_WRITE, .oid = op->oid,
sizeof(journal_entry_big_write) + clean_entry_bitmap_size .version = op->version,
); });
dirty_it->second.journal_sector = journal.sector_info[journal.cur_sector].offset; assert(dirty_it != dirty_db.end());
journal.used_sectors[journal.sector_info[journal.cur_sector].offset]++; io_uring_sqe *sqe = NULL;
BS_SUBMIT_GET_SQE_DECL(sqe);
journal_entry_big_write *je = (journal_entry_big_write*)prefill_single_journal_entry(
journal, op->opcode == BS_OP_WRITE_STABLE ? JE_BIG_WRITE_INSTANT : JE_BIG_WRITE,
sizeof(journal_entry_big_write) + clean_entry_bitmap_size
);
dirty_it->second.journal_sector = journal.sector_info[journal.cur_sector].offset;
journal.used_sectors[journal.sector_info[journal.cur_sector].offset]++;
#ifdef BLOCKSTORE_DEBUG #ifdef BLOCKSTORE_DEBUG
printf( printf(
"journal offset %08lx is used by %lx:%lx v%lu (%lu refs)\n", "journal offset %08lx is used by %lx:%lx v%lu (%lu refs)\n",
journal.sector_info[journal.cur_sector].offset, op->oid.inode, op->oid.stripe, op->version, journal.sector_info[journal.cur_sector].offset, op->oid.inode, op->oid.stripe, op->version,
journal.used_sectors[journal.sector_info[journal.cur_sector].offset] journal.used_sectors[journal.sector_info[journal.cur_sector].offset]
); );
#endif #endif
je->oid = op->oid; je->oid = op->oid;
je->version = op->version; je->version = op->version;
je->offset = op->offset; je->offset = op->offset;
je->len = op->len; je->len = op->len;
je->location = dirty_it->second.location; je->location = dirty_it->second.location;
memcpy((void*)(je+1), (clean_entry_bitmap_size > sizeof(void*) ? dirty_it->second.bitmap : &dirty_it->second.bitmap), clean_entry_bitmap_size); memcpy((void*)(je+1), (clean_entry_bitmap_size > sizeof(void*) ? dirty_it->second.bitmap : &dirty_it->second.bitmap), clean_entry_bitmap_size);
je->crc32 = je_crc32((journal_entry*)je); je->crc32 = je_crc32((journal_entry*)je);
journal.crc32_last = je->crc32; journal.crc32_last = je->crc32;
prepare_journal_sector_write(journal, journal.cur_sector, sqe, prepare_journal_sector_write(journal, journal.cur_sector, sqe,
[this, op](ring_data_t *data) { handle_write_event(data, op); }); [this, op](ring_data_t *data) { handle_write_event(data, op); });
PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 1 + journal.cur_sector; PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 1 + journal.cur_sector;
PRIV(op)->pending_ops = 1; PRIV(op)->pending_ops = 1;
PRIV(op)->op_state = 3; PRIV(op)->op_state = 3;
return 1; return 1;
}
resume_4: resume_4:
// Switch object state // Switch object state
#ifdef BLOCKSTORE_DEBUG #ifdef BLOCKSTORE_DEBUG
printf("Ack write %lx:%lx v%lu = state 0x%x\n", op->oid.inode, op->oid.stripe, op->version, dirty_it->second.state); printf("Ack write %lx:%lx v%lu = state 0x%x\n", op->oid.inode, op->oid.stripe, op->version, dirty_it->second.state);
#endif #endif
bool imm = (dirty_it->second.state & BS_ST_TYPE_MASK) == BS_ST_BIG_WRITE
? (immediate_commit == IMMEDIATE_ALL)
: (immediate_commit != IMMEDIATE_NONE);
if (imm)
{ {
auto & unstab = unstable_writes[op->oid]; auto dirty_it = dirty_db.find((obj_ver_id){
unstab = unstab < op->version ? op->version : unstab; .oid = op->oid,
} .version = op->version,
dirty_it->second.state = (dirty_it->second.state & ~BS_ST_WORKFLOW_MASK) });
| (imm ? BS_ST_SYNCED : BS_ST_WRITTEN); assert(dirty_it != dirty_db.end());
if (imm && ((dirty_it->second.state & BS_ST_TYPE_MASK) == BS_ST_DELETE || (dirty_it->second.state & BS_ST_INSTANT))) bool is_big = (dirty_it->second.state & BS_ST_TYPE_MASK) == BS_ST_BIG_WRITE;
{ bool imm = is_big ? (immediate_commit == IMMEDIATE_ALL) : (immediate_commit != IMMEDIATE_NONE);
// Deletions and 'instant' operations are treated as immediately stable if (imm)
mark_stable(dirty_it->first);
}
if (!imm)
{
if ((dirty_it->second.state & BS_ST_TYPE_MASK) == BS_ST_BIG_WRITE)
{ {
// Remember big write as unsynced auto & unstab = unstable_writes[op->oid];
unsynced_big_writes.push_back((obj_ver_id){ unstab = unstab < op->version ? op->version : unstab;
.oid = op->oid,
.version = op->version,
});
} }
else dirty_it->second.state = (dirty_it->second.state & ~BS_ST_WORKFLOW_MASK)
| (imm ? BS_ST_SYNCED : BS_ST_WRITTEN);
if (imm && ((dirty_it->second.state & BS_ST_TYPE_MASK) == BS_ST_DELETE || (dirty_it->second.state & BS_ST_INSTANT)))
{ {
// Remember small write as unsynced // Deletions and 'instant' operations are treated as immediately stable
unsynced_small_writes.push_back((obj_ver_id){ mark_stable(dirty_it->first);
.oid = op->oid,
.version = op->version,
});
} }
} if (!imm)
if (imm && (dirty_it->second.state & BS_ST_TYPE_MASK) == BS_ST_BIG_WRITE)
{
// Unblock small writes
dirty_it++;
while (dirty_it != dirty_db.end() && dirty_it->first.oid == op->oid)
{ {
if ((dirty_it->second.state & BS_ST_WORKFLOW_MASK) == BS_ST_WAIT_BIG) if (is_big)
{ {
dirty_it->second.state = (dirty_it->second.state & ~BS_ST_WORKFLOW_MASK) | BS_ST_IN_FLIGHT; // Remember big write as unsynced
unsynced_big_writes.push_back((obj_ver_id){
.oid = op->oid,
.version = op->version,
});
} }
else
{
// Remember small write as unsynced
unsynced_small_writes.push_back((obj_ver_id){
.oid = op->oid,
.version = op->version,
});
}
}
if (imm && (dirty_it->second.state & BS_ST_TYPE_MASK) == BS_ST_BIG_WRITE)
{
// Unblock small writes
dirty_it++; dirty_it++;
while (dirty_it != dirty_db.end() && dirty_it->first.oid == op->oid)
{
if ((dirty_it->second.state & BS_ST_WORKFLOW_MASK) == BS_ST_WAIT_BIG)
{
dirty_it->second.state = (dirty_it->second.state & ~BS_ST_WORKFLOW_MASK) | BS_ST_IN_FLIGHT;
}
dirty_it++;
}
}
// Apply throttling to not fill the journal too fast for the SSD+HDD case
if (!is_big && throttle_small_writes)
{
// Apply throttling
timespec tv_end;
clock_gettime(CLOCK_REALTIME, &tv_end);
uint64_t exec_us =
(tv_end.tv_sec - PRIV(op)->tv_begin.tv_sec)*1000000 +
(tv_end.tv_nsec - PRIV(op)->tv_begin.tv_nsec)/1000;
// Compare with target execution time
// 100% free -> target time = 0
// 0% free -> target time = iodepth/parallelism * (iops + size/bw) / write per second
uint64_t used_start = journal.get_trim_pos();
uint64_t journal_free_space = journal.next_free < used_start
? (used_start - journal.next_free)
: (journal.len - journal.next_free + used_start - journal.block_size);
uint64_t ref_us =
(write_iodepth <= throttle_target_parallelism ? 100 : 100*write_iodepth/throttle_target_parallelism)
* (1000000/throttle_target_iops + op->len*1000000/throttle_target_mbs/1024/1024)
/ 100;
ref_us -= ref_us * journal_free_space / journal.len;
if (ref_us > exec_us + throttle_threshold_us)
{
// Pause reply
tfd->set_timer_us(ref_us-exec_us, false, [this, op](int timer_id)
{
PRIV(op)->op_state++;
ringloop->wakeup();
});
PRIV(op)->op_state = 5;
return 1;
}
} }
} }
resume_6:
// Acknowledge write // Acknowledge write
op->retval = op->len; op->retval = op->len;
write_iodepth--; write_iodepth--;

View File

@ -25,6 +25,7 @@
// -bs_config='{"data_device":"./test_data.bin"}' -size=1000M // -bs_config='{"data_device":"./test_data.bin"}' -size=1000M
#include "blockstore.h" #include "blockstore.h"
#include "epoll_manager.h"
#include "fio_headers.h" #include "fio_headers.h"
#include "json11/json11.hpp" #include "json11/json11.hpp"
@ -32,6 +33,7 @@
struct bs_data struct bs_data
{ {
blockstore_t *bs; blockstore_t *bs;
epoll_manager_t *epmgr;
ring_loop_t *ringloop; ring_loop_t *ringloop;
/* The list of completed io_u structs. */ /* The list of completed io_u structs. */
std::vector<io_u*> completed; std::vector<io_u*> completed;
@ -104,6 +106,7 @@ static void bs_cleanup(struct thread_data *td)
} }
safe: safe:
delete bsd->bs; delete bsd->bs;
delete bsd->epmgr;
delete bsd->ringloop; delete bsd->ringloop;
delete bsd; delete bsd;
} }
@ -129,7 +132,8 @@ static int bs_init(struct thread_data *td)
} }
} }
bsd->ringloop = new ring_loop_t(512); bsd->ringloop = new ring_loop_t(512);
bsd->bs = new blockstore_t(config, bsd->ringloop); bsd->epmgr = new epoll_manager_t(bsd->ringloop);
bsd->bs = new blockstore_t(config, bsd->ringloop, bsd->epmgr->tfd);
while (1) while (1)
{ {
bsd->ringloop->loop(); bsd->ringloop->loop();

View File

@ -27,14 +27,15 @@ osd_t::osd_t(blockstore_config_t & config, ring_loop_t *ringloop)
this->config = config; this->config = config;
this->ringloop = ringloop; this->ringloop = ringloop;
epmgr = new epoll_manager_t(ringloop);
// FIXME: Use timerfd_interval based directly on io_uring
this->tfd = epmgr->tfd;
// FIXME: Create Blockstore from on-disk superblock config and check it against the OSD cluster config // FIXME: Create Blockstore from on-disk superblock config and check it against the OSD cluster config
this->bs = new blockstore_t(config, ringloop); this->bs = new blockstore_t(config, ringloop, tfd);
parse_config(config); parse_config(config);
epmgr = new epoll_manager_t(ringloop);
this->tfd = epmgr->tfd;
this->tfd->set_timer(print_stats_interval*1000, true, [this](int timer_id) this->tfd->set_timer(print_stats_interval*1000, true, [this](int timer_id)
{ {
print_stats(); print_stats();

View File

@ -3,6 +3,7 @@
#include <malloc.h> #include <malloc.h>
#include "blockstore.h" #include "blockstore.h"
#include "epoll_manager.h"
int main(int narg, char *args[]) int main(int narg, char *args[])
{ {
@ -11,7 +12,8 @@ int main(int narg, char *args[])
config["journal_device"] = "./test_journal.bin"; config["journal_device"] = "./test_journal.bin";
config["data_device"] = "./test_data.bin"; config["data_device"] = "./test_data.bin";
ring_loop_t *ringloop = new ring_loop_t(512); ring_loop_t *ringloop = new ring_loop_t(512);
blockstore_t *bs = new blockstore_t(config, ringloop); epoll_manager_t *epmgr = new epoll_manager_t(ringloop);
blockstore_t *bs = new blockstore_t(config, ringloop, epmgr->tfd);
blockstore_op_t op; blockstore_op_t op;
int main_state = 0; int main_state = 0;
@ -120,6 +122,7 @@ int main(int narg, char *args[])
ringloop->wait(); ringloop->wait();
} }
delete bs; delete bs;
delete epmgr;
delete ringloop; delete ringloop;
return 0; return 0;
} }