Fix infinite looping in continue_recovery_op() when pg_cancel_write_queue() is called

Vitaliy Filippov 2020-10-20 21:56:01 +00:00
rodzic 9abf3c17c9
commit 738ad5af79
2 zmienionych plików z 21 dodań i 8 usunięć

Wyświetl plik

@ -258,6 +258,10 @@ void osd_t::submit_recovery_op(osd_recovery_op_t *op)
.len = 0,
},
};
if (log_level > 2)
{
printf("Submitting recovery operation for %lx:%lx\n", op->oid.inode, op->oid.stripe);
}
op->osd_op->callback = [this, op](osd_op_t *osd_op)
{
// Don't sync the write, it will be synced by our regular sync coroutine
@ -267,6 +271,11 @@ void osd_t::submit_recovery_op(osd_recovery_op_t *op)
if (osd_op->reply.hdr.retval == -EPIPE)
{
// PG is stopped or one of the OSDs is gone, error is harmless
printf(
"Recovery operation failed with object %lx:%lx (PG %u/%u)\n",
op->oid.inode, op->oid.stripe, INODE_POOL(op->oid.inode),
map_to_pg(op->oid, st_cli.pool_config.at(INODE_POOL(op->oid.inode)).pg_stripe_size)
);
}
else
{

Wyświetl plik

@ -552,24 +552,28 @@ void osd_t::submit_primary_stab_subops(osd_op_t *cur_op)
// Cancel the queued writes for object <oid> in <pg>, finishing each of them with <retval>.
//
// <first_op> is expected to be the operation at the head of the object's write queue.
// If it is not (no entry for <oid>, or the head entry is a different operation), only
// <first_op> itself is finished and the queue is left untouched.
//
// Otherwise every queued operation with key == <oid> (including <first_op>) is removed
// from pg.write_queue and then finished exactly once.
// NOTE(review): assumes entries with equal keys are adjacent in pg.write_queue
// (e.g. a multimap keyed by object_id) - confirm against the pg_t declaration.
void osd_t::pg_cancel_write_queue(pg_t & pg, osd_op_t *first_op, object_id oid, int retval)
{
    auto st_it = pg.write_queue.find(oid), it = st_it;
    if (it == pg.write_queue.end() || it->second != first_op)
    {
        // Write queue doesn't match the first operation.
        // first_op is a leftover operation from the previous peering of the same PG.
        finish_op(first_op, retval);
        return;
    }
    // Collect only the operations queued for this object. The key check keeps us from
    // running past the object's own entries and cancelling writes of unrelated objects.
    std::vector<osd_op_t*> cancel_ops;
    while (it != pg.write_queue.end() && it->first == oid)
    {
        cancel_ops.push_back(it->second);
        ++it;
    }
    // First erase them and then run finish_op() for the sake of reenterability.
    // Calling finish_op() on a live iterator previously triggered a bug where some
    // of the OSDs were looping infinitely if you stopped all of them with kill -INT during recovery.
    // The guard above guarantees that at least first_op was collected, so the range is never empty.
    pg.write_queue.erase(st_it, it);
    for (auto op: cancel_ops)
    {
        finish_op(op, retval);
    }
}