Fix two more cluster client bugs

1) Sync could delete an unfinished write due to the lack of ordering (fixed by introducing syncing_writes)
2) Writes could be postponed indefinitely due to bad resuming of operations after a sync
Vitaliy Filippov 2020-06-27 02:13:33 +03:00
parent 592bcd3699
commit 9b33f598d3
2 changed files with 43 additions and 21 deletions

View File

@ -18,6 +18,20 @@ cluster_client_t::cluster_client_t(ring_loop_t *ringloop, timerfd_manager_t *tfd
else if (unsynced_writes.size()) else if (unsynced_writes.size())
{ {
// peer_osd just dropped connection // peer_osd just dropped connection
for (auto op: syncing_writes)
{
for (auto & part: op->parts)
{
if (part.osd_num == peer_osd && part.done)
{
// repeat this operation
part.osd_num = 0;
part.done = false;
assert(!part.sent);
op->done_count--;
}
}
}
for (auto op: unsynced_writes) for (auto op: unsynced_writes)
{ {
for (auto & part: op->parts) for (auto & part: op->parts)
@ -206,6 +220,10 @@ void cluster_client_t::on_change_hook(json11::Json::object & changes)
{ {
op->needs_reslice = true; op->needs_reslice = true;
} }
for (auto op: syncing_writes)
{
op->needs_reslice = true;
}
pg_count = st_cli.pg_config.size(); pg_count = st_cli.pg_config.size();
if (pg_count) if (pg_count)
{ {
@ -281,19 +299,19 @@ void cluster_client_t::execute(cluster_op_t *op)
{ {
if (next_writes.size() > 0) if (next_writes.size() > 0)
{ {
assert(cur_sync);
next_writes.push_back(op); next_writes.push_back(op);
return; return;
} }
if (queued_bytes >= client_dirty_limit) if (queued_bytes >= client_dirty_limit)
{ {
// Push an extra SYNC operation to flush previous writes // Push an extra SYNC operation to flush previous writes
next_writes.push_back(op);
cluster_op_t *sync_op = new cluster_op_t; cluster_op_t *sync_op = new cluster_op_t;
sync_op->is_internal = true; sync_op->is_internal = true;
sync_op->opcode = OSD_OP_SYNC; sync_op->opcode = OSD_OP_SYNC;
sync_op->callback = [](cluster_op_t* sync_op) {}; sync_op->callback = [](cluster_op_t* sync_op) {};
execute_sync(sync_op); execute_sync(sync_op);
next_writes.push_back(op);
queued_bytes = 0;
return; return;
} }
queued_bytes += op->len; queued_bytes += op->len;
@ -512,10 +530,8 @@ void cluster_client_t::execute_sync(cluster_op_t *op)
op->retval = 0; op->retval = 0;
std::function<void(cluster_op_t*)>(op->callback)(op); std::function<void(cluster_op_t*)>(op->callback)(op);
} }
else if (next_writes.size() > 0 || cur_sync != NULL) else if (cur_sync != NULL)
{ {
// There are some writes postponed to be executed after SYNC
// Push this SYNC after them
next_writes.push_back(op); next_writes.push_back(op);
} }
else else
@ -565,6 +581,7 @@ void cluster_client_t::continue_sync()
return; return;
} }
} }
syncing_writes.swap(unsynced_writes);
// Post sync to affected OSDs // Post sync to affected OSDs
cur_sync->parts.resize(sync_osds.size()); cur_sync->parts.resize(sync_osds.size());
int i = 0; int i = 0;
@ -584,6 +601,18 @@ void cluster_client_t::continue_sync()
void cluster_client_t::finish_sync() void cluster_client_t::finish_sync()
{ {
int retval = cur_sync->retval; int retval = cur_sync->retval;
if (retval != 0)
{
for (auto op: syncing_writes)
{
if (op->done_count < op->parts.size())
{
cur_ops.insert(op);
}
}
unsynced_writes.insert(unsynced_writes.begin(), syncing_writes.begin(), syncing_writes.end());
syncing_writes.clear();
}
if (retval == -EPIPE) if (retval == -EPIPE)
{ {
// Retry later // Retry later
@ -596,8 +625,9 @@ void cluster_client_t::finish_sync()
std::function<void(cluster_op_t*)>(cur_sync->callback)(cur_sync); std::function<void(cluster_op_t*)>(cur_sync->callback)(cur_sync);
if (!retval) if (!retval)
{ {
for (auto op: unsynced_writes) for (auto op: syncing_writes)
{ {
assert(op->sent_count == 0);
if (op->is_internal) if (op->is_internal)
{ {
if (op->buf) if (op->buf)
@ -605,24 +635,15 @@ void cluster_client_t::finish_sync()
delete op; delete op;
} }
} }
unsynced_writes.clear(); syncing_writes.clear();
} }
cur_sync = NULL; cur_sync = NULL;
while (next_writes.size() > 0) queued_bytes = 0;
std::vector<cluster_op_t*> next_wr_copy;
next_wr_copy.swap(next_writes);
for (auto next_op: next_wr_copy)
{ {
if (next_writes[0]->opcode == OSD_OP_SYNC) execute(next_op);
{
cur_sync = next_writes[0];
next_writes.erase(next_writes.begin(), next_writes.begin()+1);
continue_sync();
}
else
{
auto wr = next_writes[0];
cur_ops.insert(wr);
next_writes.erase(next_writes.begin(), next_writes.begin()+1);
continue_rw(wr);
}
} }
} }

View File

@ -72,6 +72,7 @@ class cluster_client_t
// unsynced operations are copied in memory to allow replay when cluster isn't in the immediate_commit mode // unsynced operations are copied in memory to allow replay when cluster isn't in the immediate_commit mode
// unsynced_writes are replayed in any order (because only the SYNC operation guarantees ordering) // unsynced_writes are replayed in any order (because only the SYNC operation guarantees ordering)
std::vector<cluster_op_t*> unsynced_writes; std::vector<cluster_op_t*> unsynced_writes;
std::vector<cluster_op_t*> syncing_writes;
cluster_op_t* cur_sync = NULL; cluster_op_t* cur_sync = NULL;
std::vector<cluster_op_t*> next_writes; std::vector<cluster_op_t*> next_writes;
std::vector<cluster_op_t*> offline_ops; std::vector<cluster_op_t*> offline_ops;