Fsync data before writing metadata
parent
a165909745
commit
aa3b252327
|
@ -272,6 +272,7 @@ class blockstore
|
||||||
uint64_t data_offset, data_size, data_len;
|
uint64_t data_offset, data_size, data_len;
|
||||||
|
|
||||||
bool readonly = false;
|
bool readonly = false;
|
||||||
|
// FIXME: separate flags for data, metadata and journal
|
||||||
bool disable_fsync = false;
|
bool disable_fsync = false;
|
||||||
bool inmemory_meta = false;
|
bool inmemory_meta = false;
|
||||||
void *metadata_buffer = NULL;
|
void *metadata_buffer = NULL;
|
||||||
|
|
|
@ -5,7 +5,6 @@ journal_flusher_t::journal_flusher_t(int flusher_count, blockstore *bs)
|
||||||
this->bs = bs;
|
this->bs = bs;
|
||||||
this->flusher_count = flusher_count;
|
this->flusher_count = flusher_count;
|
||||||
active_flushers = 0;
|
active_flushers = 0;
|
||||||
active_until_sync = 0;
|
|
||||||
sync_threshold = flusher_count == 1 ? 1 : flusher_count/2;
|
sync_threshold = flusher_count == 1 ? 1 : flusher_count/2;
|
||||||
journal_trim_interval = sync_threshold;
|
journal_trim_interval = sync_threshold;
|
||||||
journal_trim_counter = 0;
|
journal_trim_counter = 0;
|
||||||
|
@ -133,8 +132,6 @@ bool journal_flusher_co::loop()
|
||||||
goto resume_9;
|
goto resume_9;
|
||||||
else if (wait_state == 10)
|
else if (wait_state == 10)
|
||||||
goto resume_10;
|
goto resume_10;
|
||||||
else if (wait_state == 11)
|
|
||||||
goto resume_11;
|
|
||||||
else if (wait_state == 12)
|
else if (wait_state == 12)
|
||||||
goto resume_12;
|
goto resume_12;
|
||||||
else if (wait_state == 13)
|
else if (wait_state == 13)
|
||||||
|
@ -143,6 +140,12 @@ bool journal_flusher_co::loop()
|
||||||
goto resume_14;
|
goto resume_14;
|
||||||
else if (wait_state == 15)
|
else if (wait_state == 15)
|
||||||
goto resume_15;
|
goto resume_15;
|
||||||
|
else if (wait_state == 16)
|
||||||
|
goto resume_16;
|
||||||
|
else if (wait_state == 17)
|
||||||
|
goto resume_17;
|
||||||
|
else if (wait_state == 18)
|
||||||
|
goto resume_18;
|
||||||
resume_0:
|
resume_0:
|
||||||
if (!flusher->flush_queue.size())
|
if (!flusher->flush_queue.size())
|
||||||
{
|
{
|
||||||
|
@ -177,7 +180,6 @@ resume_0:
|
||||||
#endif
|
#endif
|
||||||
dirty_it = dirty_end;
|
dirty_it = dirty_end;
|
||||||
flusher->active_flushers++;
|
flusher->active_flushers++;
|
||||||
flusher->active_until_sync++;
|
|
||||||
v.clear();
|
v.clear();
|
||||||
wait_count = 0;
|
wait_count = 0;
|
||||||
copy_count = 0;
|
copy_count = 0;
|
||||||
|
@ -261,7 +263,6 @@ resume_0:
|
||||||
{
|
{
|
||||||
// Nothing to flush
|
// Nothing to flush
|
||||||
flusher->active_flushers--;
|
flusher->active_flushers--;
|
||||||
flusher->active_until_sync--;
|
|
||||||
repeat_it = flusher->sync_to_repeat.find(cur.oid);
|
repeat_it = flusher->sync_to_repeat.find(cur.oid);
|
||||||
if (repeat_it->second > cur.version)
|
if (repeat_it->second > cur.version)
|
||||||
{
|
{
|
||||||
|
@ -339,11 +340,17 @@ resume_0:
|
||||||
);
|
);
|
||||||
wait_count++;
|
wait_count++;
|
||||||
}
|
}
|
||||||
|
// Sync data before writing metadata
|
||||||
|
resume_16:
|
||||||
|
resume_17:
|
||||||
|
resume_18:
|
||||||
|
if (copy_count && !fsync_batch(false, 16))
|
||||||
|
{
|
||||||
|
wait_state += 16;
|
||||||
|
return false;
|
||||||
|
}
|
||||||
resume_5:
|
resume_5:
|
||||||
// And metadata writes, but only after data writes complete
|
// And metadata writes, but only after data writes complete
|
||||||
// FIXME: Oops. We must sync data before writing metadata.
|
|
||||||
// Because this will result in more fsyncs we should reduce flushers activity by postponing
|
|
||||||
// flushes until there is sufficient amount of work to do (for example, at least ~32 operations in the queue).
|
|
||||||
if (!bs->inmemory_meta && meta_new.it->second.state == 0 || wait_count > 0)
|
if (!bs->inmemory_meta && meta_new.it->second.state == 0 || wait_count > 0)
|
||||||
{
|
{
|
||||||
// metadata sector is still being read or data is still being written, wait for it
|
// metadata sector is still being read or data is still being written, wait for it
|
||||||
|
@ -409,14 +416,13 @@ resume_0:
|
||||||
free(it->buf);
|
free(it->buf);
|
||||||
}
|
}
|
||||||
v.clear();
|
v.clear();
|
||||||
// And sync everything (in batches - not per each operation!)
|
// And sync metadata (in batches - not per each operation!)
|
||||||
flusher->active_until_sync--;
|
|
||||||
resume_8:
|
resume_8:
|
||||||
resume_9:
|
resume_9:
|
||||||
resume_10:
|
resume_10:
|
||||||
resume_11:
|
if (!fsync_batch(true, 8))
|
||||||
if (!fsync_batch())
|
|
||||||
{
|
{
|
||||||
|
wait_state += 8;
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
// Update clean_db and dirty_db, free old data locations
|
// Update clean_db and dirty_db, free old data locations
|
||||||
|
@ -568,56 +574,54 @@ void journal_flusher_co::update_clean_db()
|
||||||
bs->dirty_db.erase(dirty_it, std::next(dirty_end));
|
bs->dirty_db.erase(dirty_it, std::next(dirty_end));
|
||||||
}
|
}
|
||||||
|
|
||||||
bool journal_flusher_co::fsync_batch()
|
bool journal_flusher_co::fsync_batch(bool fsync_meta, int wait_base)
|
||||||
{
|
{
|
||||||
if (wait_state == 8)
|
if (wait_state == wait_base)
|
||||||
goto resume_8;
|
goto resume_0;
|
||||||
else if (wait_state == 9)
|
else if (wait_state == wait_base+1)
|
||||||
goto resume_9;
|
goto resume_1;
|
||||||
else if (wait_state == 10)
|
else if (wait_state == wait_base+2)
|
||||||
goto resume_10;
|
goto resume_2;
|
||||||
else if (wait_state == 11)
|
|
||||||
goto resume_11;
|
|
||||||
if (!bs->disable_fsync)
|
if (!bs->disable_fsync)
|
||||||
{
|
{
|
||||||
cur_sync = flusher->syncs.end();
|
cur_sync = flusher->syncs.end();
|
||||||
if (cur_sync == flusher->syncs.begin() || cur_sync->state == 1)
|
while (cur_sync != flusher->syncs.begin())
|
||||||
cur_sync = flusher->syncs.emplace(flusher->syncs.end(), (flusher_sync_t){ .ready_count = 0, .state = 0 });
|
{
|
||||||
else
|
|
||||||
cur_sync--;
|
cur_sync--;
|
||||||
|
if (cur_sync->fsync_meta == fsync_meta && cur_sync->state == 0)
|
||||||
|
goto sync_found;
|
||||||
|
}
|
||||||
|
cur_sync = flusher->syncs.emplace(flusher->syncs.end(), (flusher_sync_t){
|
||||||
|
.fsync_meta = fsync_meta,
|
||||||
|
.ready_count = 0,
|
||||||
|
.state = 0,
|
||||||
|
});
|
||||||
|
sync_found:
|
||||||
cur_sync->ready_count++;
|
cur_sync->ready_count++;
|
||||||
if (cur_sync->ready_count >= flusher->sync_threshold ||
|
if (cur_sync->ready_count >= flusher->sync_threshold || !flusher->flush_queue.size())
|
||||||
!flusher->active_until_sync && (!flusher->flush_queue.size() || flusher->active_flushers >= flusher->flusher_count))
|
|
||||||
{
|
{
|
||||||
// Sync batch is ready. Do it.
|
// Sync batch is ready. Do it.
|
||||||
await_sqe(9);
|
await_sqe(0);
|
||||||
data->callback = simple_callback_w;
|
data->callback = simple_callback_w;
|
||||||
data->iov = { 0 };
|
data->iov = { 0 };
|
||||||
my_uring_prep_fsync(sqe, bs->data_fd, IORING_FSYNC_DATASYNC);
|
my_uring_prep_fsync(sqe, fsync_meta ? bs->meta_fd : bs->data_fd, IORING_FSYNC_DATASYNC);
|
||||||
|
cur_sync->state = 1;
|
||||||
wait_count++;
|
wait_count++;
|
||||||
if (bs->meta_fd != bs->data_fd)
|
resume_1:
|
||||||
{
|
|
||||||
await_sqe(10);
|
|
||||||
data->callback = simple_callback_w;
|
|
||||||
data->iov = { 0 };
|
|
||||||
my_uring_prep_fsync(sqe, bs->meta_fd, IORING_FSYNC_DATASYNC);
|
|
||||||
wait_count++;
|
|
||||||
}
|
|
||||||
resume_11:
|
|
||||||
if (wait_count > 0)
|
if (wait_count > 0)
|
||||||
{
|
{
|
||||||
wait_state = 11;
|
wait_state = 1;
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
// Sync completed. All previous coroutines waiting for it must be resumed
|
// Sync completed. All previous coroutines waiting for it must be resumed
|
||||||
cur_sync->state = 1;
|
cur_sync->state = 2;
|
||||||
bs->ringloop->wakeup(bs->ring_consumer);
|
bs->ringloop->wakeup(bs->ring_consumer);
|
||||||
}
|
}
|
||||||
// Wait until someone else sends and completes a sync.
|
// Wait until someone else sends and completes a sync.
|
||||||
resume_8:
|
resume_2:
|
||||||
if (!cur_sync->state)
|
if (!cur_sync->state)
|
||||||
{
|
{
|
||||||
wait_state = 8;
|
wait_state = 2;
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
cur_sync->ready_count--;
|
cur_sync->ready_count--;
|
||||||
|
|
|
@ -14,6 +14,7 @@ struct meta_sector_t
|
||||||
|
|
||||||
struct flusher_sync_t
|
struct flusher_sync_t
|
||||||
{
|
{
|
||||||
|
bool fsync_meta;
|
||||||
int ready_count;
|
int ready_count;
|
||||||
int state;
|
int state;
|
||||||
};
|
};
|
||||||
|
@ -50,7 +51,7 @@ class journal_flusher_co
|
||||||
friend class journal_flusher_t;
|
friend class journal_flusher_t;
|
||||||
bool modify_meta_read(uint64_t meta_loc, flusher_meta_write_t &wr, int wait_base);
|
bool modify_meta_read(uint64_t meta_loc, flusher_meta_write_t &wr, int wait_base);
|
||||||
void update_clean_db();
|
void update_clean_db();
|
||||||
bool fsync_batch();
|
bool fsync_batch(bool fsync_meta, int wait_base);
|
||||||
public:
|
public:
|
||||||
journal_flusher_co();
|
journal_flusher_co();
|
||||||
bool loop();
|
bool loop();
|
||||||
|
@ -68,7 +69,7 @@ class journal_flusher_t
|
||||||
int journal_trim_counter, journal_trim_interval;
|
int journal_trim_counter, journal_trim_interval;
|
||||||
void* journal_superblock;
|
void* journal_superblock;
|
||||||
|
|
||||||
int active_flushers, active_until_sync;
|
int active_flushers;
|
||||||
std::list<flusher_sync_t> syncs;
|
std::list<flusher_sync_t> syncs;
|
||||||
std::map<object_id, uint64_t> sync_to_repeat;
|
std::map<object_id, uint64_t> sync_to_repeat;
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue