Browse Source

Fix infinite looping in continue_recovery_op() when pg_cancel_write_queue() is called

tags/v0.5.1
Vitaliy Filippov 2 months ago
parent
commit
738ad5af79
2 changed files with 21 additions and 8 deletions
  1. +9
    -0
      osd_flush.cpp
  2. +12
    -8
      osd_primary_subops.cpp

+ 9
- 0
osd_flush.cpp View File

@@ -258,6 +258,10 @@ void osd_t::submit_recovery_op(osd_recovery_op_t *op)
.len = 0,
},
};
if (log_level > 2)
{
printf("Submitting recovery operation for %lx:%lx\n", op->oid.inode, op->oid.stripe);
}
op->osd_op->callback = [this, op](osd_op_t *osd_op)
{
// Don't sync the write, it will be synced by our regular sync coroutine
@@ -267,6 +271,11 @@ void osd_t::submit_recovery_op(osd_recovery_op_t *op)
if (osd_op->reply.hdr.retval == -EPIPE)
{
// PG is stopped or one of the OSDs is gone, error is harmless
printf(
"Recovery operation failed with object %lx:%lx (PG %u/%u)\n",
op->oid.inode, op->oid.stripe, INODE_POOL(op->oid.inode),
map_to_pg(op->oid, st_cli.pool_config.at(INODE_POOL(op->oid.inode)).pg_stripe_size)
);
}
else
{


+ 12
- 8
osd_primary_subops.cpp View File

@@ -552,24 +552,28 @@ void osd_t::submit_primary_stab_subops(osd_op_t *cur_op)
// Cancel the queued writes for object <oid> in <pg>, passing <retval> to finish_op()
// for each cancelled operation.
//
// pg        PG whose write_queue is inspected and modified
// first_op  operation the caller expects to be at the head of the queue for <oid>
// oid       object whose queued operations are cancelled
// retval    result code handed to finish_op() for every cancelled operation
//
// If the queue entry found for <oid> is not <first_op>, only <first_op> is finished
// and the queue is left untouched.
void osd_t::pg_cancel_write_queue(pg_t & pg, osd_op_t *first_op, object_id oid, int retval)
{
    auto st_it = pg.write_queue.find(oid), it = st_it;
    if (it == pg.write_queue.end() || it->second != first_op)
    {
        // Write queue doesn't match the first operation.
        // first_op is a leftover operation from the previous peering of the same PG.
        finish_op(first_op, retval);
        return;
    }
    // Collect the operations first: nothing may be finished while we still hold
    // iterators into the queue. first_op is the entry at st_it, so it is collected
    // here and finished exactly once below.
    std::vector<osd_op_t*> cancel_ops;
    // Stop at the first entry with a different key so that queued writes
    // for other objects are not cancelled along with this one.
    while (it != pg.write_queue.end() && it->first == oid)
    {
        cancel_ops.push_back(it->second);
        ++it;
    }
    if (st_it != it)
    {
        // First erase them and then run finish_op() for the sake of reenterability
        // Calling finish_op() on a live iterator previously triggered a bug where some
        // of the OSDs were looping infinitely if you stopped all of them with kill -INT during recovery
        pg.write_queue.erase(st_it, it);
        for (auto op: cancel_ops)
        {
            finish_op(op, retval);
        }
    }
}

Loading…
Cancel
Save