Fix infinite looping in continue_recovery_op() when pg_cancel_write_queue() is called
parent
9abf3c17c9
commit
738ad5af79
|
@ -258,6 +258,10 @@ void osd_t::submit_recovery_op(osd_recovery_op_t *op)
|
|||
.len = 0,
|
||||
},
|
||||
};
|
||||
if (log_level > 2)
|
||||
{
|
||||
printf("Submitting recovery operation for %lx:%lx\n", op->oid.inode, op->oid.stripe);
|
||||
}
|
||||
op->osd_op->callback = [this, op](osd_op_t *osd_op)
|
||||
{
|
||||
// Don't sync the write, it will be synced by our regular sync coroutine
|
||||
|
@ -267,6 +271,11 @@ void osd_t::submit_recovery_op(osd_recovery_op_t *op)
|
|||
if (osd_op->reply.hdr.retval == -EPIPE)
|
||||
{
|
||||
// PG is stopped or one of the OSDs is gone, error is harmless
|
||||
printf(
|
||||
"Recovery operation failed with object %lx:%lx (PG %u/%u)\n",
|
||||
op->oid.inode, op->oid.stripe, INODE_POOL(op->oid.inode),
|
||||
map_to_pg(op->oid, st_cli.pool_config.at(INODE_POOL(op->oid.inode)).pg_stripe_size)
|
||||
);
|
||||
}
|
||||
else
|
||||
{
|
||||
|
|
|
@ -552,24 +552,28 @@ void osd_t::submit_primary_stab_subops(osd_op_t *cur_op)
|
|||
void osd_t::pg_cancel_write_queue(pg_t & pg, osd_op_t *first_op, object_id oid, int retval)
|
||||
{
|
||||
auto st_it = pg.write_queue.find(oid), it = st_it;
|
||||
finish_op(first_op, retval);
|
||||
if (it != pg.write_queue.end() && it->second == first_op)
|
||||
{
|
||||
it++;
|
||||
}
|
||||
else
|
||||
if (it == pg.write_queue.end() || it->second != first_op)
|
||||
{
|
||||
// Write queue doesn't match the first operation.
|
||||
// first_op is a leftover operation from the previous peering of the same PG.
|
||||
finish_op(first_op, retval);
|
||||
return;
|
||||
}
|
||||
while (it != pg.write_queue.end() && it->first == oid)
|
||||
std::vector<osd_op_t*> cancel_ops;
|
||||
while (it != pg.write_queue.end())
|
||||
{
|
||||
finish_op(it->second, retval);
|
||||
cancel_ops.push_back(it->second);
|
||||
it++;
|
||||
}
|
||||
if (st_it != it)
|
||||
{
|
||||
// First erase them and then run finish_op() for the sake of reenterability
|
||||
// Calling finish_op() on a live iterator previously triggered a bug where some
|
||||
// of the OSDs were looping infinitely if you stopped all of them with kill -INT during recovery
|
||||
pg.write_queue.erase(st_it, it);
|
||||
for (auto op: cancel_ops)
|
||||
{
|
||||
finish_op(op, retval);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in a new issue