|
|
|
@ -5,15 +5,30 @@ journal_flusher_t::journal_flusher_t(int flusher_count, blockstore *bs) |
|
|
|
|
this->bs = bs; |
|
|
|
|
this->flusher_count = flusher_count; |
|
|
|
|
this->active_flushers = 0; |
|
|
|
|
this->active_until_sync = 0; |
|
|
|
|
this->sync_required = true; |
|
|
|
|
this->sync_threshold = flusher_count == 1 ? 1 : flusher_count/2; |
|
|
|
|
co = new journal_flusher_co[flusher_count]; |
|
|
|
|
for (int i = 0; i < flusher_count; i++) |
|
|
|
|
{ |
|
|
|
|
co[i].bs = bs; |
|
|
|
|
co[i].wait_state = 0; |
|
|
|
|
co[i].flusher = this; |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
// Constructs a flusher coroutine in its idle state (wait_state 0) and installs
// the completion callback shared by the plain I/O submissions of this coroutine.
journal_flusher_co::journal_flusher_co()
{
    wait_state = 0;
    // Shared completion handler: a negative result is treated as fatal,
    // otherwise one outstanding operation is accounted as finished.
    simple_callback = [this](ring_data_t* data)
    {
        if (data->res < 0)
        {
            // Throw by value, not `throw new ...`: a thrown pointer leaks the
            // exception object and is not matched by handlers that catch
            // std::exception / std::runtime_error by reference.
            throw std::runtime_error("write operation failed. in-memory state is corrupted. AAAAAAAaaaaaaaaa!!!111");
        }
        wait_count--;
    };
}
|
|
|
|
|
|
|
|
|
journal_flusher_t::~journal_flusher_t() |
|
|
|
|
{ |
|
|
|
|
delete[] co; |
|
|
|
@ -31,6 +46,16 @@ void journal_flusher_t::loop() |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
// Coroutine helper. Declares the resume point `resume_<label>` and asks the
// blockstore for a submission queue entry. If bs->get_sqe() yields nothing,
// `label` is saved into wait_state and the enclosing function returns; the
// wait_state dispatch at the top of loop() jumps back to the resume point on
// the next call. On success `sqe` holds the entry and `data` is set from its
// user_data field.
#define await_sqe(label)                    \
    resume_##label:                         \
    sqe = bs->get_sqe();                    \
    if (!sqe)                               \
    {                                       \
        wait_state = label;                 \
        return;                             \
    }                                       \
    data = ((ring_data_t*)sqe->user_data);
|
|
|
|
|
|
|
|
|
void journal_flusher_co::loop() |
|
|
|
|
{ |
|
|
|
|
// This is much better than implementing the whole function as an FSM
|
|
|
|
@ -49,6 +74,15 @@ void journal_flusher_co::loop() |
|
|
|
|
goto resume_6; |
|
|
|
|
else if (wait_state == 7) |
|
|
|
|
goto resume_7; |
|
|
|
|
else if (wait_state == 8) |
|
|
|
|
goto resume_8; |
|
|
|
|
else if (wait_state == 9) |
|
|
|
|
goto resume_9; |
|
|
|
|
else if (wait_state == 10) |
|
|
|
|
goto resume_10; |
|
|
|
|
else if (wait_state == 11) |
|
|
|
|
goto resume_11; |
|
|
|
|
resume_0: |
|
|
|
|
if (!flusher->flush_queue.size()) |
|
|
|
|
return; |
|
|
|
|
cur = flusher->flush_queue.front(); |
|
|
|
@ -57,6 +91,7 @@ void journal_flusher_co::loop() |
|
|
|
|
if (dirty_it != bs->dirty_db.end()) |
|
|
|
|
{ |
|
|
|
|
flusher->active_flushers++; |
|
|
|
|
flusher->active_until_sync++; |
|
|
|
|
v.clear(); |
|
|
|
|
wait_count = 0; |
|
|
|
|
clean_loc = UINT64_MAX; |
|
|
|
@ -77,21 +112,10 @@ void journal_flusher_co::loop() |
|
|
|
|
if (it == v.end() || it->offset > offset) |
|
|
|
|
{ |
|
|
|
|
submit_len = it->offset >= offset+len ? len : it->offset-offset; |
|
|
|
|
resume_1: |
|
|
|
|
sqe = bs->get_sqe(); |
|
|
|
|
if (!sqe) |
|
|
|
|
{ |
|
|
|
|
// Can't submit read, ring is full
|
|
|
|
|
wait_state = 1; |
|
|
|
|
return; |
|
|
|
|
} |
|
|
|
|
await_sqe(1); |
|
|
|
|
v.insert(it, (copy_buffer_t){ .offset = offset, .len = submit_len, .buf = memalign(512, submit_len) }); |
|
|
|
|
data = ((ring_data_t*)sqe->user_data); |
|
|
|
|
data->iov = (struct iovec){ v.end()->buf, (size_t)submit_len }; |
|
|
|
|
data->callback = [this](ring_data_t* data) |
|
|
|
|
{ |
|
|
|
|
wait_count--; |
|
|
|
|
}; |
|
|
|
|
data->iov = (struct iovec){ v.back().buf, (size_t)submit_len }; |
|
|
|
|
data->callback = simple_callback; |
|
|
|
|
io_uring_prep_readv( |
|
|
|
|
sqe, bs->journal.fd, &data->iov, 1, bs->journal.offset + dirty_it->second.location + offset |
|
|
|
|
); |
|
|
|
@ -150,17 +174,14 @@ void journal_flusher_co::loop() |
|
|
|
|
.buf = memalign(512, 512), |
|
|
|
|
.usage_count = 1, |
|
|
|
|
}).first; |
|
|
|
|
resume_2: |
|
|
|
|
sqe = bs->get_sqe(); |
|
|
|
|
if (!sqe) |
|
|
|
|
{ |
|
|
|
|
wait_state = 2; |
|
|
|
|
return; |
|
|
|
|
} |
|
|
|
|
data = ((ring_data_t*)sqe->user_data); |
|
|
|
|
await_sqe(2); |
|
|
|
|
data->iov = (struct iovec){ meta_it->second.buf, 512 }; |
|
|
|
|
data->callback = [this](ring_data_t* data) |
|
|
|
|
{ |
|
|
|
|
if (data->res < 0) |
|
|
|
|
{ |
|
|
|
|
throw new std::runtime_error("write operation failed. in-memory state is corrupted. AAAAAAAaaaaaaaaa!!!111"); |
|
|
|
|
} |
|
|
|
|
meta_it->second.state = 1; |
|
|
|
|
wait_count--; |
|
|
|
|
}; |
|
|
|
@ -173,80 +194,103 @@ void journal_flusher_co::loop() |
|
|
|
|
meta_it->second.usage_count++; |
|
|
|
|
wait_state = 3; |
|
|
|
|
resume_3: |
|
|
|
|
// After reads complete we submit writes
|
|
|
|
|
if (wait_count == 0) |
|
|
|
|
if (wait_count > 0) |
|
|
|
|
return; |
|
|
|
|
// Reads completed, submit writes
|
|
|
|
|
for (it = v.begin(); it != v.end(); it++) |
|
|
|
|
{ |
|
|
|
|
await_sqe(4); |
|
|
|
|
data->iov = (struct iovec){ it->buf, (size_t)it->len }; |
|
|
|
|
data->callback = simple_callback; |
|
|
|
|
io_uring_prep_writev( |
|
|
|
|
sqe, bs->data_fd, &data->iov, 1, bs->data_offset + clean_loc + it->offset |
|
|
|
|
); |
|
|
|
|
wait_count++; |
|
|
|
|
} |
|
|
|
|
// And a metadata write
|
|
|
|
|
resume_5: |
|
|
|
|
if (meta_it->second.state == 0) |
|
|
|
|
{ |
|
|
|
|
// metadata sector is still being read, wait for it
|
|
|
|
|
wait_state = 5; |
|
|
|
|
return; |
|
|
|
|
} |
|
|
|
|
*((clean_disk_entry*)meta_it->second.buf + meta_pos) = { |
|
|
|
|
.oid = cur.oid, |
|
|
|
|
.version = cur.version, |
|
|
|
|
}; |
|
|
|
|
// I consider unordered writes to data & metadata safe here, because
|
|
|
|
|
// "dirty" entries always override "clean" entries in our case
|
|
|
|
|
await_sqe(6); |
|
|
|
|
data->iov = (struct iovec){ meta_it->second.buf, 512 }; |
|
|
|
|
data->callback = simple_callback; |
|
|
|
|
io_uring_prep_writev( |
|
|
|
|
sqe, bs->meta_fd, &data->iov, 1, bs->meta_offset + meta_sector |
|
|
|
|
); |
|
|
|
|
wait_count++; |
|
|
|
|
wait_state = 7; |
|
|
|
|
resume_7: |
|
|
|
|
if (wait_count > 0) |
|
|
|
|
return; |
|
|
|
|
// Done, free all buffers
|
|
|
|
|
meta_it->second.usage_count--; |
|
|
|
|
if (meta_it->second.usage_count == 0) |
|
|
|
|
{ |
|
|
|
|
free(meta_it->second.buf); |
|
|
|
|
flusher->meta_sectors.erase(meta_it); |
|
|
|
|
} |
|
|
|
|
for (it = v.begin(); it != v.end(); it++) |
|
|
|
|
{ |
|
|
|
|
free(it->buf); |
|
|
|
|
} |
|
|
|
|
v.clear(); |
|
|
|
|
flusher->active_until_sync--; |
|
|
|
|
if (flusher->sync_required) |
|
|
|
|
{ |
|
|
|
|
for (it = v.begin(); it != v.end(); it++) |
|
|
|
|
// And sync everything (in batches - not per each operation!)
|
|
|
|
|
cur_sync = flusher->syncs.end(); |
|
|
|
|
if (cur_sync == flusher->syncs.begin()) |
|
|
|
|
cur_sync = flusher->syncs.emplace(flusher->syncs.end(), (flusher_sync_t){ .ready_count = 0, .state = 0 }); |
|
|
|
|
else |
|
|
|
|
cur_sync--; |
|
|
|
|
cur_sync->ready_count++; |
|
|
|
|
if (cur_sync->ready_count >= flusher->sync_threshold || |
|
|
|
|
!flusher->active_until_sync && !flusher->flush_queue.size()) |
|
|
|
|
{ |
|
|
|
|
resume_4: |
|
|
|
|
sqe = bs->get_sqe(); |
|
|
|
|
if (!sqe) |
|
|
|
|
// Sync batch is ready. Do it.
|
|
|
|
|
await_sqe(9); |
|
|
|
|
data->callback = simple_callback; |
|
|
|
|
io_uring_prep_fsync(sqe, bs->data_fd, 0); |
|
|
|
|
wait_count++; |
|
|
|
|
if (bs->meta_fd != bs->data_fd) |
|
|
|
|
{ |
|
|
|
|
// Can't submit a write, ring is full
|
|
|
|
|
wait_state = 4; |
|
|
|
|
return; |
|
|
|
|
await_sqe(10); |
|
|
|
|
data->callback = simple_callback; |
|
|
|
|
io_uring_prep_fsync(sqe, bs->meta_fd, 0); |
|
|
|
|
wait_count++; |
|
|
|
|
} |
|
|
|
|
data = ((ring_data_t*)sqe->user_data); |
|
|
|
|
data->iov = (struct iovec){ it->buf, (size_t)it->len }; |
|
|
|
|
data->callback = [this](ring_data_t* data) |
|
|
|
|
{ |
|
|
|
|
wait_count--; |
|
|
|
|
}; |
|
|
|
|
io_uring_prep_writev( |
|
|
|
|
sqe, bs->data_fd, &data->iov, 1, bs->data_offset + clean_loc + it->offset |
|
|
|
|
); |
|
|
|
|
wait_count++; |
|
|
|
|
} |
|
|
|
|
// And a metadata write
|
|
|
|
|
resume_5: |
|
|
|
|
if (meta_it->second.state == 0) |
|
|
|
|
{ |
|
|
|
|
// metadata sector is still being read, wait for it
|
|
|
|
|
wait_state = 5; |
|
|
|
|
return; |
|
|
|
|
wait_state = 11; |
|
|
|
|
resume_11: |
|
|
|
|
if (wait_count > 0) |
|
|
|
|
return; |
|
|
|
|
// Sync completed. All previous coroutines waiting for it must be resumed
|
|
|
|
|
cur_sync->state = 1; |
|
|
|
|
} |
|
|
|
|
*((clean_disk_entry*)meta_it->second.buf + meta_pos) = { |
|
|
|
|
.oid = cur.oid, |
|
|
|
|
.version = cur.version, |
|
|
|
|
}; |
|
|
|
|
resume_6: |
|
|
|
|
sqe = bs->get_sqe(); |
|
|
|
|
if (!sqe) |
|
|
|
|
// Wait until someone else sends and completes a sync.
|
|
|
|
|
resume_8: |
|
|
|
|
if (!cur_sync->state) |
|
|
|
|
{ |
|
|
|
|
// Can't submit a write, ring is full
|
|
|
|
|
wait_state = 6; |
|
|
|
|
wait_state = 8; |
|
|
|
|
return; |
|
|
|
|
} |
|
|
|
|
data = ((ring_data_t*)sqe->user_data); |
|
|
|
|
data->iov = (struct iovec){ meta_it->second.buf, 512 }; |
|
|
|
|
data->callback = [this](ring_data_t* data) |
|
|
|
|
cur_sync->ready_count--; |
|
|
|
|
if (cur_sync->ready_count == 0) |
|
|
|
|
{ |
|
|
|
|
wait_count--; |
|
|
|
|
}; |
|
|
|
|
io_uring_prep_writev( |
|
|
|
|
sqe, bs->meta_fd, &data->iov, 1, bs->meta_offset + meta_sector |
|
|
|
|
); |
|
|
|
|
wait_count++; |
|
|
|
|
wait_state = 7; |
|
|
|
|
resume_7: |
|
|
|
|
// Done, free all buffers
|
|
|
|
|
if (wait_count == 0) |
|
|
|
|
{ |
|
|
|
|
meta_it->second.usage_count--; |
|
|
|
|
if (meta_it->second.usage_count == 0) |
|
|
|
|
{ |
|
|
|
|
free(meta_it->second.buf); |
|
|
|
|
flusher->meta_sectors.erase(meta_it); |
|
|
|
|
} |
|
|
|
|
for (it = v.begin(); it != v.end(); it++) |
|
|
|
|
{ |
|
|
|
|
free(it->buf); |
|
|
|
|
} |
|
|
|
|
v.clear(); |
|
|
|
|
wait_state = 0; |
|
|
|
|
flusher->active_flushers--; |
|
|
|
|
flusher->syncs.erase(cur_sync); |
|
|
|
|
} |
|
|
|
|
// FIXME Now sync everything
|
|
|
|
|
} |
|
|
|
|
wait_state = 0; |
|
|
|
|
flusher->active_flushers--; |
|
|
|
|
goto resume_0; |
|
|
|
|
} |
|
|
|
|
} |
|
|
|
|