Browse Source

Sync metadata & data after copying from journal

blocking-uring-test
Vitaliy Filippov 3 years ago
parent
commit
f1e236c6e8
  1. 218
      blockstore_flush.cpp
  2. 17
      blockstore_flush.h
  3. 5
      blockstore_stable.cpp
  4. 2
      blockstore_sync.cpp
  5. 1
      blockstore_write.cpp

218
blockstore_flush.cpp

@ -5,15 +5,30 @@ journal_flusher_t::journal_flusher_t(int flusher_count, blockstore *bs)
this->bs = bs;
this->flusher_count = flusher_count;
this->active_flushers = 0;
this->active_until_sync = 0;
this->sync_required = true;
this->sync_threshold = flusher_count == 1 ? 1 : flusher_count/2;
co = new journal_flusher_co[flusher_count];
for (int i = 0; i < flusher_count; i++)
{
co[i].bs = bs;
co[i].wait_state = 0;
co[i].flusher = this;
}
}
// Construct a flusher coroutine in its idle state (wait_state == 0).
// simple_callback is the shared io_uring completion handler installed on this
// coroutine's read/write/fsync submissions: it decrements wait_count so the
// state machine in loop() can resume once all in-flight operations finish.
journal_flusher_co::journal_flusher_co()
{
wait_state = 0;
simple_callback = [this](ring_data_t* data)
{
if (data->res < 0)
{
// NOTE(review): `throw new` throws a *pointer*, which leaks unless the
// catch site deletes it. The codebase uses this style consistently, so
// it is kept here, but throw-by-value (throw std::runtime_error{...})
// is the preferred idiom — confirm before changing catch sites.
throw new std::runtime_error("write operation failed. in-memory state is corrupted. AAAAAAAaaaaaaaaa!!!111");
}
wait_count--;
};
}
journal_flusher_t::~journal_flusher_t()
{
delete[] co;
@ -31,6 +46,16 @@ void journal_flusher_t::loop()
}
}
// Cooperative-wait helper for the goto-based state machine in
// journal_flusher_co::loop(): expands to a `resume_<label>` jump target,
// tries to acquire a submission queue entry, and — if the ring is full —
// records <label> in wait_state and returns so loop() can re-enter at the
// same point later. On success, `data` points at the SQE's per-operation
// user data, ready for iov/callback setup.
// (No comments inside the macro body: `//` would swallow the `\` line
// continuations.)
#define await_sqe(label) \
resume_##label:\
sqe = bs->get_sqe();\
if (!sqe)\
{\
wait_state = label;\
return;\
}\
data = ((ring_data_t*)sqe->user_data);
void journal_flusher_co::loop()
{
// This is much better than implementing the whole function as an FSM
@ -49,6 +74,15 @@ void journal_flusher_co::loop()
goto resume_6;
else if (wait_state == 7)
goto resume_7;
else if (wait_state == 8)
goto resume_8;
else if (wait_state == 9)
goto resume_9;
else if (wait_state == 10)
goto resume_10;
else if (wait_state == 11)
goto resume_11;
resume_0:
if (!flusher->flush_queue.size())
return;
cur = flusher->flush_queue.front();
@ -57,6 +91,7 @@ void journal_flusher_co::loop()
if (dirty_it != bs->dirty_db.end())
{
flusher->active_flushers++;
flusher->active_until_sync++;
v.clear();
wait_count = 0;
clean_loc = UINT64_MAX;
@ -77,21 +112,10 @@ void journal_flusher_co::loop()
if (it == v.end() || it->offset > offset)
{
submit_len = it->offset >= offset+len ? len : it->offset-offset;
resume_1:
sqe = bs->get_sqe();
if (!sqe)
{
// Can't submit read, ring is full
wait_state = 1;
return;
}
await_sqe(1);
v.insert(it, (copy_buffer_t){ .offset = offset, .len = submit_len, .buf = memalign(512, submit_len) });
data = ((ring_data_t*)sqe->user_data);
data->iov = (struct iovec){ v.end()->buf, (size_t)submit_len };
data->callback = [this](ring_data_t* data)
{
wait_count--;
};
data->iov = (struct iovec){ v.back().buf, (size_t)submit_len };
data->callback = simple_callback;
io_uring_prep_readv(
sqe, bs->journal.fd, &data->iov, 1, bs->journal.offset + dirty_it->second.location + offset
);
@ -150,17 +174,14 @@ void journal_flusher_co::loop()
.buf = memalign(512, 512),
.usage_count = 1,
}).first;
resume_2:
sqe = bs->get_sqe();
if (!sqe)
{
wait_state = 2;
return;
}
data = ((ring_data_t*)sqe->user_data);
await_sqe(2);
data->iov = (struct iovec){ meta_it->second.buf, 512 };
data->callback = [this](ring_data_t* data)
{
if (data->res < 0)
{
throw new std::runtime_error("write operation failed. in-memory state is corrupted. AAAAAAAaaaaaaaaa!!!111");
}
meta_it->second.state = 1;
wait_count--;
};
@ -173,80 +194,103 @@ void journal_flusher_co::loop()
meta_it->second.usage_count++;
wait_state = 3;
resume_3:
// After reads complete we submit writes
if (wait_count == 0)
if (wait_count > 0)
return;
// Reads completed, submit writes
for (it = v.begin(); it != v.end(); it++)
{
await_sqe(4);
data->iov = (struct iovec){ it->buf, (size_t)it->len };
data->callback = simple_callback;
io_uring_prep_writev(
sqe, bs->data_fd, &data->iov, 1, bs->data_offset + clean_loc + it->offset
);
wait_count++;
}
// And a metadata write
resume_5:
if (meta_it->second.state == 0)
{
// metadata sector is still being read, wait for it
wait_state = 5;
return;
}
*((clean_disk_entry*)meta_it->second.buf + meta_pos) = {
.oid = cur.oid,
.version = cur.version,
};
// I consider unordered writes to data & metadata safe here, because
// "dirty" entries always override "clean" entries in our case
await_sqe(6);
data->iov = (struct iovec){ meta_it->second.buf, 512 };
data->callback = simple_callback;
io_uring_prep_writev(
sqe, bs->meta_fd, &data->iov, 1, bs->meta_offset + meta_sector
);
wait_count++;
wait_state = 7;
resume_7:
if (wait_count > 0)
return;
// Done, free all buffers
meta_it->second.usage_count--;
if (meta_it->second.usage_count == 0)
{
free(meta_it->second.buf);
flusher->meta_sectors.erase(meta_it);
}
for (it = v.begin(); it != v.end(); it++)
{
free(it->buf);
}
v.clear();
flusher->active_until_sync--;
if (flusher->sync_required)
{
for (it = v.begin(); it != v.end(); it++)
// And sync everything (in batches - not per each operation!)
cur_sync = flusher->syncs.end();
if (cur_sync == flusher->syncs.begin())
cur_sync = flusher->syncs.emplace(flusher->syncs.end(), (flusher_sync_t){ .ready_count = 0, .state = 0 });
else
cur_sync--;
cur_sync->ready_count++;
if (cur_sync->ready_count >= flusher->sync_threshold ||
!flusher->active_until_sync && !flusher->flush_queue.size())
{
resume_4:
sqe = bs->get_sqe();
if (!sqe)
// Sync batch is ready. Do it.
await_sqe(9);
data->callback = simple_callback;
io_uring_prep_fsync(sqe, bs->data_fd, 0);
wait_count++;
if (bs->meta_fd != bs->data_fd)
{
// Can't submit a write, ring is full
wait_state = 4;
return;
await_sqe(10);
data->callback = simple_callback;
io_uring_prep_fsync(sqe, bs->meta_fd, 0);
wait_count++;
}
data = ((ring_data_t*)sqe->user_data);
data->iov = (struct iovec){ it->buf, (size_t)it->len };
data->callback = [this](ring_data_t* data)
{
wait_count--;
};
io_uring_prep_writev(
sqe, bs->data_fd, &data->iov, 1, bs->data_offset + clean_loc + it->offset
);
wait_count++;
}
// And a metadata write
resume_5:
if (meta_it->second.state == 0)
{
// metadata sector is still being read, wait for it
wait_state = 5;
return;
wait_state = 11;
resume_11:
if (wait_count > 0)
return;
// Sync completed. All previous coroutines waiting for it must be resumed
cur_sync->state = 1;
}
*((clean_disk_entry*)meta_it->second.buf + meta_pos) = {
.oid = cur.oid,
.version = cur.version,
};
resume_6:
sqe = bs->get_sqe();
if (!sqe)
// Wait until someone else sends and completes a sync.
resume_8:
if (!cur_sync->state)
{
// Can't submit a write, ring is full
wait_state = 6;
wait_state = 8;
return;
}
data = ((ring_data_t*)sqe->user_data);
data->iov = (struct iovec){ meta_it->second.buf, 512 };
data->callback = [this](ring_data_t* data)
cur_sync->ready_count--;
if (cur_sync->ready_count == 0)
{
wait_count--;
};
io_uring_prep_writev(
sqe, bs->meta_fd, &data->iov, 1, bs->meta_offset + meta_sector
);
wait_count++;
wait_state = 7;
resume_7:
// Done, free all buffers
if (wait_count == 0)
{
meta_it->second.usage_count--;
if (meta_it->second.usage_count == 0)
{
free(meta_it->second.buf);
flusher->meta_sectors.erase(meta_it);
}
for (it = v.begin(); it != v.end(); it++)
{
free(it->buf);
}
v.clear();
wait_state = 0;
flusher->active_flushers--;
flusher->syncs.erase(cur_sync);
}
// FIXME Now sync everything
}
wait_state = 0;
flusher->active_flushers--;
goto resume_0;
}
}

17
blockstore_flush.h

@ -12,10 +12,16 @@ struct meta_sector_t
int usage_count;
};
// One batched fsync shared by several flusher coroutines: coroutines join
// the current batch instead of issuing an fsync each (see sync_threshold).
struct flusher_sync_t
{
int ready_count; // coroutines that joined this batch and wait on its completion
int state; // 0 = sync not yet completed, 1 = sync done, waiters may resume
};
class journal_flusher_t;
// Journal flusher coroutine
struct journal_flusher_co
class journal_flusher_co
{
blockstore *bs;
journal_flusher_t *flusher;
@ -29,8 +35,11 @@ struct journal_flusher_co
std::vector<copy_buffer_t>::iterator it;
uint64_t offset, len, submit_len, clean_loc, meta_sector, meta_pos;
std::map<uint64_t, meta_sector_t>::iterator meta_it;
std::function<void(ring_data_t*)> simple_callback;
std::list<flusher_sync_t>::iterator cur_sync;
friend class journal_flusher_t;
public:
journal_flusher_co();
void loop();
};
@ -38,10 +47,14 @@ public:
class journal_flusher_t
{
int flusher_count;
int active_flushers;
int sync_threshold;
bool sync_required;
journal_flusher_co *co;
blockstore *bs;
friend class journal_flusher_co;
int active_flushers, active_until_sync;
std::list<flusher_sync_t> syncs;
public:
std::map<uint64_t, meta_sector_t> meta_sectors;
std::deque<obj_ver_id> flush_queue;

5
blockstore_stable.cpp

@ -2,9 +2,6 @@
// Stabilize small write:
// 1) Copy data from the journal to the data device
// Sync it before writing metadata if we want to keep metadata consistent
// Overall it's optional because it can be replayed from the journal until
// it's cleared, and reads are also fulfilled from the journal
// 2) Increase version on the metadata device and sync it
// 3) Advance clean_db entry's version, clear previous journal entries
//
@ -112,8 +109,6 @@ void blockstore::handle_stable_event(ring_data_t *data, blockstore_operation *op
{
if (data->res < 0)
{
// sync error
// FIXME: our state becomes corrupted after a write error. maybe do something better than just die
throw new std::runtime_error("write operation failed. in-memory state is corrupted. AAAAAAAaaaaaaaaa!!!111");
}
op->pending_ops--;

2
blockstore_sync.cpp

@ -111,8 +111,6 @@ void blockstore::handle_sync_event(ring_data_t *data, blockstore_operation *op)
{
if (data->res < 0)
{
// sync error
// FIXME: our state becomes corrupted after a write error. maybe do something better than just die
throw new std::runtime_error("write operation failed. in-memory state is corrupted. AAAAAAAaaaaaaaaa!!!111");
}
op->pending_ops--;

1
blockstore_write.cpp

@ -149,7 +149,6 @@ void blockstore::handle_write_event(ring_data_t *data, blockstore_operation *op)
{
if (data->res < 0)
{
// write error
// FIXME: our state becomes corrupted after a write error. maybe do something better than just die
throw new std::runtime_error("write operation failed. in-memory state is corrupted. AAAAAAAaaaaaaaaa!!!111");
}

Loading…
Cancel
Save