Add journal fsync to stabilize/rollback

trace-sqes
Vitaliy Filippov 2020-03-09 00:35:54 +03:00
parent c863543bfe
commit c3737ae3ff
7 changed files with 193 additions and 103 deletions

View File

@ -6,6 +6,7 @@ journal_flusher_t::journal_flusher_t(int flusher_count, blockstore_impl_t *bs)
this->flusher_count = flusher_count; this->flusher_count = flusher_count;
dequeuing = false; dequeuing = false;
active_flushers = 0; active_flushers = 0;
syncing_flushers = 0;
sync_threshold = bs->journal_block_size / sizeof(journal_entry_stable); sync_threshold = bs->journal_block_size / sizeof(journal_entry_stable);
journal_trim_interval = sync_threshold; journal_trim_interval = sync_threshold;
journal_trim_counter = 0; journal_trim_counter = 0;
@ -649,7 +650,8 @@ bool journal_flusher_co::fsync_batch(bool fsync_meta, int wait_base)
}); });
sync_found: sync_found:
cur_sync->ready_count++; cur_sync->ready_count++;
if (cur_sync->ready_count >= flusher->sync_threshold || !flusher->flush_queue.size()) flusher->syncing_flushers++;
if (flusher->syncing_flushers >= flusher->flusher_count || !flusher->flush_queue.size())
{ {
// Sync batch is ready. Do it. // Sync batch is ready. Do it.
await_sqe(0); await_sqe(0);
@ -675,6 +677,7 @@ bool journal_flusher_co::fsync_batch(bool fsync_meta, int wait_base)
wait_state = 2; wait_state = 2;
return false; return false;
} }
flusher->syncing_flushers--;
cur_sync->ready_count--; cur_sync->ready_count--;
if (cur_sync->ready_count == 0) if (cur_sync->ready_count == 0)
{ {

View File

@ -84,6 +84,7 @@ class journal_flusher_t
void* journal_superblock; void* journal_superblock;
int active_flushers; int active_flushers;
int syncing_flushers;
std::list<flusher_sync_t> syncs; std::list<flusher_sync_t> syncs;
std::map<object_id, uint64_t> sync_to_repeat; std::map<object_id, uint64_t> sync_to_repeat;

View File

@ -364,7 +364,7 @@ void blockstore_impl_t::enqueue_op(blockstore_op_t *op, bool first)
// Call constructor without allocating memory. We'll call destructor before returning op back // Call constructor without allocating memory. We'll call destructor before returning op back
new ((void*)op->private_data) blockstore_op_private_t; new ((void*)op->private_data) blockstore_op_private_t;
PRIV(op)->wait_for = 0; PRIV(op)->wait_for = 0;
PRIV(op)->sync_state = 0; PRIV(op)->op_state = 0;
PRIV(op)->pending_ops = 0; PRIV(op)->pending_ops = 0;
if (!first) if (!first)
{ {

View File

@ -147,6 +147,7 @@ struct blockstore_op_private_t
int wait_for; int wait_for;
uint64_t wait_detail; uint64_t wait_detail;
int pending_ops; int pending_ops;
int op_state;
// Read // Read
std::vector<fulfill_read_t> read_vec; std::vector<fulfill_read_t> read_vec;
@ -161,7 +162,7 @@ struct blockstore_op_private_t
std::vector<obj_ver_id> sync_big_writes, sync_small_writes; std::vector<obj_ver_id> sync_big_writes, sync_small_writes;
int sync_small_checked, sync_big_checked; int sync_small_checked, sync_big_checked;
std::list<blockstore_op_t*>::iterator in_progress_ptr; std::list<blockstore_op_t*>::iterator in_progress_ptr;
int sync_state, prev_sync_count; int prev_sync_count;
}; };
// https://github.com/algorithm-ninja/cpp-btree // https://github.com/algorithm-ninja/cpp-btree
@ -280,11 +281,13 @@ class blockstore_impl_t
// Stabilize // Stabilize
int dequeue_stable(blockstore_op_t *op); int dequeue_stable(blockstore_op_t *op);
int continue_stable(blockstore_op_t *op);
void handle_stable_event(ring_data_t *data, blockstore_op_t *op); void handle_stable_event(ring_data_t *data, blockstore_op_t *op);
void stabilize_object(object_id oid, uint64_t max_ver); void stabilize_object(object_id oid, uint64_t max_ver);
// Rollback // Rollback
int dequeue_rollback(blockstore_op_t *op); int dequeue_rollback(blockstore_op_t *op);
int continue_rollback(blockstore_op_t *op);
void handle_rollback_event(ring_data_t *data, blockstore_op_t *op); void handle_rollback_event(ring_data_t *data, blockstore_op_t *op);
void erase_dirty(blockstore_dirty_db_t::iterator dirty_start, blockstore_dirty_db_t::iterator dirty_end, uint64_t clean_loc); void erase_dirty(blockstore_dirty_db_t::iterator dirty_start, blockstore_dirty_db_t::iterator dirty_end, uint64_t clean_loc);

View File

@ -2,6 +2,10 @@
int blockstore_impl_t::dequeue_rollback(blockstore_op_t *op) int blockstore_impl_t::dequeue_rollback(blockstore_op_t *op)
{ {
if (PRIV(op)->op_state)
{
return continue_rollback(op);
}
obj_ver_id* v; obj_ver_id* v;
int i, todo = op->len; int i, todo = op->len;
for (i = 0, v = (obj_ver_id*)op->buf; i < op->len; i++, v++) for (i = 0, v = (obj_ver_id*)op->buf; i < op->len; i++, v++)
@ -110,24 +114,41 @@ int blockstore_impl_t::dequeue_rollback(blockstore_op_t *op)
} }
PRIV(op)->max_flushed_journal_sector = 1 + journal.cur_sector; PRIV(op)->max_flushed_journal_sector = 1 + journal.cur_sector;
PRIV(op)->pending_ops = s; PRIV(op)->pending_ops = s;
PRIV(op)->op_state = 1;
return 1; return 1;
} }
void blockstore_impl_t::handle_rollback_event(ring_data_t *data, blockstore_op_t *op) int blockstore_impl_t::continue_rollback(blockstore_op_t *op)
{ {
live = true; if (PRIV(op)->op_state == 2)
if (data->res != data->iov.iov_len) goto resume_2;
{ else if (PRIV(op)->op_state == 3)
throw std::runtime_error( goto resume_3;
"write operation failed ("+std::to_string(data->res)+" != "+std::to_string(data->iov.iov_len)+ else if (PRIV(op)->op_state == 5)
"). in-memory state is corrupted. AAAAAAAaaaaaaaaa!!!111" goto resume_5;
); else
} return 1;
PRIV(op)->pending_ops--; resume_2:
if (PRIV(op)->pending_ops == 0)
{
// Release used journal sectors // Release used journal sectors
release_journal_sectors(op); release_journal_sectors(op);
resume_3:
if (!disable_journal_fsync)
{
io_uring_sqe *sqe = get_sqe();
if (!sqe)
{
return 0;
}
ring_data_t *data = ((ring_data_t*)sqe->user_data);
my_uring_prep_fsync(sqe, journal.fd, IORING_FSYNC_DATASYNC);
data->iov = { 0 };
data->callback = [this, op](ring_data_t *data) { handle_stable_event(data, op); };
PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 0;
PRIV(op)->pending_ops = 1;
PRIV(op)->op_state = 4;
return 1;
}
resume_5:
obj_ver_id* v; obj_ver_id* v;
int i; int i;
for (i = 0, v = (obj_ver_id*)op->buf; i < op->len; i++, v++) for (i = 0, v = (obj_ver_id*)op->buf; i < op->len; i++, v++)
@ -157,6 +178,27 @@ void blockstore_impl_t::handle_rollback_event(ring_data_t *data, blockstore_op_t
// Acknowledge op // Acknowledge op
op->retval = 0; op->retval = 0;
FINISH_OP(op); FINISH_OP(op);
return 1;
}
void blockstore_impl_t::handle_rollback_event(ring_data_t *data, blockstore_op_t *op)
{
live = true;
if (data->res != data->iov.iov_len)
{
throw std::runtime_error(
"write operation failed ("+std::to_string(data->res)+" != "+std::to_string(data->iov.iov_len)+
"). in-memory state is corrupted. AAAAAAAaaaaaaaaa!!!111"
);
}
PRIV(op)->pending_ops--;
if (PRIV(op)->pending_ops == 0)
{
PRIV(op)->op_state++;
if (!continue_stable(op))
{
submit_queue.push_front(op);
}
} }
} }

View File

@ -40,6 +40,10 @@
int blockstore_impl_t::dequeue_stable(blockstore_op_t *op) int blockstore_impl_t::dequeue_stable(blockstore_op_t *op)
{ {
if (PRIV(op)->op_state)
{
return continue_stable(op);
}
obj_ver_id* v; obj_ver_id* v;
int i, todo = 0; int i, todo = 0;
for (i = 0, v = (obj_ver_id*)op->buf; i < op->len; i++, v++) for (i = 0, v = (obj_ver_id*)op->buf; i < op->len; i++, v++)
@ -127,25 +131,41 @@ int blockstore_impl_t::dequeue_stable(blockstore_op_t *op)
} }
PRIV(op)->max_flushed_journal_sector = 1 + journal.cur_sector; PRIV(op)->max_flushed_journal_sector = 1 + journal.cur_sector;
PRIV(op)->pending_ops = s; PRIV(op)->pending_ops = s;
PRIV(op)->op_state = 1;
return 1; return 1;
} }
void blockstore_impl_t::handle_stable_event(ring_data_t *data, blockstore_op_t *op) int blockstore_impl_t::continue_stable(blockstore_op_t *op)
{ {
live = true; if (PRIV(op)->op_state == 2)
if (data->res != data->iov.iov_len) goto resume_2;
{ else if (PRIV(op)->op_state == 3)
throw std::runtime_error( goto resume_3;
"write operation failed ("+std::to_string(data->res)+" != "+std::to_string(data->iov.iov_len)+ else if (PRIV(op)->op_state == 5)
"). in-memory state is corrupted. AAAAAAAaaaaaaaaa!!!111" goto resume_5;
); else
} return 1;
PRIV(op)->pending_ops--; resume_2:
if (PRIV(op)->pending_ops == 0)
{
// FIXME Oops. We must sync the device!
// Release used journal sectors // Release used journal sectors
release_journal_sectors(op); release_journal_sectors(op);
resume_3:
if (!disable_journal_fsync)
{
io_uring_sqe *sqe = get_sqe();
if (!sqe)
{
return 0;
}
ring_data_t *data = ((ring_data_t*)sqe->user_data);
my_uring_prep_fsync(sqe, journal.fd, IORING_FSYNC_DATASYNC);
data->iov = { 0 };
data->callback = [this, op](ring_data_t *data) { handle_stable_event(data, op); };
PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 0;
PRIV(op)->pending_ops = 1;
PRIV(op)->op_state = 4;
return 1;
}
resume_5:
// Mark dirty_db entries as stable, acknowledge op completion // Mark dirty_db entries as stable, acknowledge op completion
obj_ver_id* v; obj_ver_id* v;
int i; int i;
@ -192,5 +212,26 @@ void blockstore_impl_t::handle_stable_event(ring_data_t *data, blockstore_op_t *
// Acknowledge op // Acknowledge op
op->retval = 0; op->retval = 0;
FINISH_OP(op); FINISH_OP(op);
return 1;
}
void blockstore_impl_t::handle_stable_event(ring_data_t *data, blockstore_op_t *op)
{
live = true;
if (data->res != data->iov.iov_len)
{
throw std::runtime_error(
"write operation failed ("+std::to_string(data->res)+" != "+std::to_string(data->iov.iov_len)+
"). in-memory state is corrupted. AAAAAAAaaaaaaaaa!!!111"
);
}
PRIV(op)->pending_ops--;
if (PRIV(op)->pending_ops == 0)
{
PRIV(op)->op_state++;
if (!continue_stable(op))
{
submit_queue.push_front(op);
}
} }
} }

View File

@ -11,7 +11,7 @@
int blockstore_impl_t::dequeue_sync(blockstore_op_t *op) int blockstore_impl_t::dequeue_sync(blockstore_op_t *op)
{ {
if (PRIV(op)->sync_state == 0) if (PRIV(op)->op_state == 0)
{ {
stop_sync_submitted = false; stop_sync_submitted = false;
PRIV(op)->sync_big_writes.swap(unsynced_big_writes); PRIV(op)->sync_big_writes.swap(unsynced_big_writes);
@ -21,11 +21,11 @@ int blockstore_impl_t::dequeue_sync(blockstore_op_t *op)
unsynced_big_writes.clear(); unsynced_big_writes.clear();
unsynced_small_writes.clear(); unsynced_small_writes.clear();
if (PRIV(op)->sync_big_writes.size() > 0) if (PRIV(op)->sync_big_writes.size() > 0)
PRIV(op)->sync_state = SYNC_HAS_BIG; PRIV(op)->op_state = SYNC_HAS_BIG;
else if (PRIV(op)->sync_small_writes.size() > 0) else if (PRIV(op)->sync_small_writes.size() > 0)
PRIV(op)->sync_state = SYNC_HAS_SMALL; PRIV(op)->op_state = SYNC_HAS_SMALL;
else else
PRIV(op)->sync_state = SYNC_DONE; PRIV(op)->op_state = SYNC_DONE;
// Always add sync to in_progress_syncs because we clear unsynced_big_writes and unsynced_small_writes // Always add sync to in_progress_syncs because we clear unsynced_big_writes and unsynced_small_writes
PRIV(op)->prev_sync_count = in_progress_syncs.size(); PRIV(op)->prev_sync_count = in_progress_syncs.size();
PRIV(op)->in_progress_ptr = in_progress_syncs.insert(in_progress_syncs.end(), op); PRIV(op)->in_progress_ptr = in_progress_syncs.insert(in_progress_syncs.end(), op);
@ -38,7 +38,7 @@ int blockstore_impl_t::dequeue_sync(blockstore_op_t *op)
int blockstore_impl_t::continue_sync(blockstore_op_t *op) int blockstore_impl_t::continue_sync(blockstore_op_t *op)
{ {
auto cb = [this, op](ring_data_t *data) { handle_sync_event(data, op); }; auto cb = [this, op](ring_data_t *data) { handle_sync_event(data, op); };
if (PRIV(op)->sync_state == SYNC_HAS_SMALL) if (PRIV(op)->op_state == SYNC_HAS_SMALL)
{ {
// No big writes, just fsync the journal // No big writes, just fsync the journal
for (; PRIV(op)->sync_small_checked < PRIV(op)->sync_small_writes.size(); PRIV(op)->sync_small_checked++) for (; PRIV(op)->sync_small_checked < PRIV(op)->sync_small_writes.size(); PRIV(op)->sync_small_checked++)
@ -56,15 +56,15 @@ int blockstore_impl_t::continue_sync(blockstore_op_t *op)
prepare_journal_sector_write(journal, journal.cur_sector, sqe, cb); prepare_journal_sector_write(journal, journal.cur_sector, sqe, cb);
PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 1 + journal.cur_sector; PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 1 + journal.cur_sector;
PRIV(op)->pending_ops = 1; PRIV(op)->pending_ops = 1;
PRIV(op)->sync_state = SYNC_JOURNAL_WRITE_SENT; PRIV(op)->op_state = SYNC_JOURNAL_WRITE_SENT;
return 1; return 1;
} }
else else
{ {
PRIV(op)->sync_state = SYNC_JOURNAL_WRITE_DONE; PRIV(op)->op_state = SYNC_JOURNAL_WRITE_DONE;
} }
} }
if (PRIV(op)->sync_state == SYNC_HAS_BIG) if (PRIV(op)->op_state == SYNC_HAS_BIG)
{ {
for (; PRIV(op)->sync_big_checked < PRIV(op)->sync_big_writes.size(); PRIV(op)->sync_big_checked++) for (; PRIV(op)->sync_big_checked < PRIV(op)->sync_big_writes.size(); PRIV(op)->sync_big_checked++)
{ {
@ -83,15 +83,15 @@ int blockstore_impl_t::continue_sync(blockstore_op_t *op)
data->callback = cb; data->callback = cb;
PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 0; PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 0;
PRIV(op)->pending_ops = 1; PRIV(op)->pending_ops = 1;
PRIV(op)->sync_state = SYNC_DATA_SYNC_SENT; PRIV(op)->op_state = SYNC_DATA_SYNC_SENT;
return 1; return 1;
} }
else else
{ {
PRIV(op)->sync_state = SYNC_DATA_SYNC_DONE; PRIV(op)->op_state = SYNC_DATA_SYNC_DONE;
} }
} }
if (PRIV(op)->sync_state == SYNC_DATA_SYNC_DONE) if (PRIV(op)->op_state == SYNC_DATA_SYNC_DONE)
{ {
for (; PRIV(op)->sync_small_checked < PRIV(op)->sync_small_writes.size(); PRIV(op)->sync_small_checked++) for (; PRIV(op)->sync_small_checked < PRIV(op)->sync_small_writes.size(); PRIV(op)->sync_small_checked++)
{ {
@ -153,10 +153,10 @@ int blockstore_impl_t::continue_sync(blockstore_op_t *op)
} }
PRIV(op)->max_flushed_journal_sector = 1 + journal.cur_sector; PRIV(op)->max_flushed_journal_sector = 1 + journal.cur_sector;
PRIV(op)->pending_ops = s; PRIV(op)->pending_ops = s;
PRIV(op)->sync_state = SYNC_JOURNAL_WRITE_SENT; PRIV(op)->op_state = SYNC_JOURNAL_WRITE_SENT;
return 1; return 1;
} }
if (PRIV(op)->sync_state == SYNC_JOURNAL_WRITE_DONE) if (PRIV(op)->op_state == SYNC_JOURNAL_WRITE_DONE)
{ {
if (!disable_journal_fsync) if (!disable_journal_fsync)
{ {
@ -165,15 +165,15 @@ int blockstore_impl_t::continue_sync(blockstore_op_t *op)
data->iov = { 0 }; data->iov = { 0 };
data->callback = cb; data->callback = cb;
PRIV(op)->pending_ops = 1; PRIV(op)->pending_ops = 1;
PRIV(op)->sync_state = SYNC_JOURNAL_SYNC_SENT; PRIV(op)->op_state = SYNC_JOURNAL_SYNC_SENT;
return 1; return 1;
} }
else else
{ {
PRIV(op)->sync_state = SYNC_DONE; PRIV(op)->op_state = SYNC_DONE;
} }
} }
if (PRIV(op)->sync_state == SYNC_DONE) if (PRIV(op)->op_state == SYNC_DONE)
{ {
ack_sync(op); ack_sync(op);
} }
@ -196,17 +196,17 @@ void blockstore_impl_t::handle_sync_event(ring_data_t *data, blockstore_op_t *op
// Release used journal sectors // Release used journal sectors
release_journal_sectors(op); release_journal_sectors(op);
// Handle states // Handle states
if (PRIV(op)->sync_state == SYNC_DATA_SYNC_SENT) if (PRIV(op)->op_state == SYNC_DATA_SYNC_SENT)
{ {
PRIV(op)->sync_state = SYNC_DATA_SYNC_DONE; PRIV(op)->op_state = SYNC_DATA_SYNC_DONE;
} }
else if (PRIV(op)->sync_state == SYNC_JOURNAL_WRITE_SENT) else if (PRIV(op)->op_state == SYNC_JOURNAL_WRITE_SENT)
{ {
PRIV(op)->sync_state = SYNC_JOURNAL_WRITE_DONE; PRIV(op)->op_state = SYNC_JOURNAL_WRITE_DONE;
} }
else if (PRIV(op)->sync_state == SYNC_JOURNAL_SYNC_SENT) else if (PRIV(op)->op_state == SYNC_JOURNAL_SYNC_SENT)
{ {
PRIV(op)->sync_state = SYNC_DONE; PRIV(op)->op_state = SYNC_DONE;
ack_sync(op); ack_sync(op);
} }
else else
@ -218,7 +218,7 @@ void blockstore_impl_t::handle_sync_event(ring_data_t *data, blockstore_op_t *op
int blockstore_impl_t::ack_sync(blockstore_op_t *op) int blockstore_impl_t::ack_sync(blockstore_op_t *op)
{ {
if (PRIV(op)->sync_state == SYNC_DONE && PRIV(op)->prev_sync_count == 0) if (PRIV(op)->op_state == SYNC_DONE && PRIV(op)->prev_sync_count == 0)
{ {
// Remove dependency of subsequent syncs // Remove dependency of subsequent syncs
auto it = PRIV(op)->in_progress_ptr; auto it = PRIV(op)->in_progress_ptr;
@ -230,7 +230,7 @@ int blockstore_impl_t::ack_sync(blockstore_op_t *op)
{ {
auto & next_sync = *it++; auto & next_sync = *it++;
PRIV(next_sync)->prev_sync_count -= done_syncs; PRIV(next_sync)->prev_sync_count -= done_syncs;
if (PRIV(next_sync)->prev_sync_count == 0 && PRIV(next_sync)->sync_state == SYNC_DONE) if (PRIV(next_sync)->prev_sync_count == 0 && PRIV(next_sync)->op_state == SYNC_DONE)
{ {
done_syncs++; done_syncs++;
// Acknowledge next_sync // Acknowledge next_sync