Fix infinite looping in continue_recovery_op() when pg_cancel_write_queue() is called

Vitaliy Filippov 2020-10-20 21:56:01 +00:00
parent 9abf3c17c9
commit 738ad5af79
2 changed files with 21 additions and 8 deletions

View File

@ -258,6 +258,10 @@ void osd_t::submit_recovery_op(osd_recovery_op_t *op)
.len = 0, .len = 0,
}, },
}; };
if (log_level > 2)
{
printf("Submitting recovery operation for %lx:%lx\n", op->oid.inode, op->oid.stripe);
}
op->osd_op->callback = [this, op](osd_op_t *osd_op) op->osd_op->callback = [this, op](osd_op_t *osd_op)
{ {
// Don't sync the write, it will be synced by our regular sync coroutine // Don't sync the write, it will be synced by our regular sync coroutine
@ -267,6 +271,11 @@ void osd_t::submit_recovery_op(osd_recovery_op_t *op)
if (osd_op->reply.hdr.retval == -EPIPE) if (osd_op->reply.hdr.retval == -EPIPE)
{ {
// PG is stopped or one of the OSDs is gone, error is harmless // PG is stopped or one of the OSDs is gone, error is harmless
printf(
"Recovery operation failed with object %lx:%lx (PG %u/%u)\n",
op->oid.inode, op->oid.stripe, INODE_POOL(op->oid.inode),
map_to_pg(op->oid, st_cli.pool_config.at(INODE_POOL(op->oid.inode)).pg_stripe_size)
);
} }
else else
{ {

View File

@ -552,24 +552,28 @@ void osd_t::submit_primary_stab_subops(osd_op_t *cur_op)
// Cancel the writes queued in <pg> for object <oid>, finishing each of them with <retval>.
//
// first_op is the operation the caller expects to find at the head of that object's queue.
// If the queue has no entry for <oid>, or the entry found is a different operation,
// only first_op itself is finished and the queue is left untouched.
void osd_t::pg_cancel_write_queue(pg_t & pg, osd_op_t *first_op, object_id oid, int retval)
{
    auto st_it = pg.write_queue.find(oid), it = st_it;
    if (it == pg.write_queue.end() || it->second != first_op)
    {
        // Write queue doesn't match the first operation.
        // first_op is a leftover operation from the previous peering of the same PG.
        finish_op(first_op, retval);
        return;
    }
    // Collect only the operations queued for this object: the walk must stop at the
    // first entry with a different key, otherwise writes queued for every following
    // object in the PG would be erased and cancelled as well.
    std::vector<osd_op_t*> cancel_ops;
    while (it != pg.write_queue.end() && it->first == oid)
    {
        cancel_ops.push_back(it->second);
        ++it;
    }
    // st_it was verified above to be a valid entry for <oid>, so [st_it, it) is never empty here.
    // First erase them and then run finish_op() for the sake of reentrancy.
    // Calling finish_op() on a live iterator previously triggered a bug where some
    // of the OSDs were looping infinitely if you stopped all of them with kill -INT during recovery
    pg.write_queue.erase(st_it, it);
    for (auto op: cancel_ops)
    {
        finish_op(op, retval);
    }
}