Report PG history synchronously during write

This has 2 effects:
1) OSD sets are no longer added into PG history until an actual write attempt,
   which removes unneeded extra osd_sets from PG history
2) New OSD sets are reported synchronously and can't be lost on PG restarts
   happening at the same time as reconfiguration
rm-left-on-dead
Vitaliy Filippov 2023-01-01 23:41:02 +03:00
parent 37a6aff2fa
commit a4dfa519af
3 changed files with 23 additions and 18 deletions

View File

@ -885,7 +885,6 @@ void osd_t::report_pg_states()
if (pg.history_changed)
{
// Prevent race conditions (for the case when the monitor is updating this key at the same time)
// FIXME: target_history updates may be lost on PG re-peering
pg.history_changed = false;
std::string history_key = base64_encode(st_cli.etcd_prefix+"/pg/history/"+std::to_string(pg.pool_id)+"/"+std::to_string(pg.pg_num));
json11::Json::object history_value = {

View File

@ -86,21 +86,9 @@ void pg_obj_state_check_t::walk()
}
if (pg->pg_cursize < pg->pg_size)
{
// Report PG history and activate
// Activate as degraded
// Current OSD set will be added into target_history on first write
pg->state |= PG_DEGRADED | PG_PEERED;
std::vector<osd_num_t> history_set;
for (auto peer_osd: pg->cur_set)
{
if (peer_osd != 0)
history_set.push_back(peer_osd);
}
std::sort(history_set.begin(), history_set.end());
auto it = std::lower_bound(pg->target_history.begin(), pg->target_history.end(), history_set);
if (it == pg->target_history.end() || *it != history_set)
{
pg->target_history.insert(it, history_set);
pg->history_changed = true;
}
}
else
{
@ -438,7 +426,7 @@ void pg_t::calc_object_states(int log_level)
std::sort(st.list.begin(), st.list.end());
// Walk over it and check object states
st.walk();
if (this->state & (PG_DEGRADED|PG_LEFT_ON_DEAD))
if (this->state != PG_ACTIVE)
{
assert(epoch != (((uint64_t)1 << PG_EPOCH_BITS)-1));
epoch++;

View File

@ -155,9 +155,21 @@ resume_3:
if (pg.epoch > pg.reported_epoch)
{
// Report newer epoch before writing
// FIXME: We may report only one PG state here...
// FIXME: We don't have to report all changed PG states here
this->pg_state_dirty.insert({ .pool_id = pg.pool_id, .pg_num = pg.pg_num });
pg.history_changed = true;
if (pg.state != PG_ACTIVE)
{
// Check that current OSD set is in history and/or add it there
std::vector<osd_num_t> history_set;
for (auto peer_osd: pg.cur_set)
if (peer_osd != 0)
history_set.push_back(peer_osd);
std::sort(history_set.begin(), history_set.end());
auto it = std::lower_bound(pg.target_history.begin(), pg.target_history.end(), history_set);
if (it == pg.target_history.end() || *it != history_set)
pg.target_history.insert(it, history_set);
pg.history_changed = true;
}
report_pg_states();
resume_10:
if (pg.epoch > pg.reported_epoch)
@ -166,6 +178,12 @@ resume_10:
return;
}
}
// Recheck PG state after reporting history - maybe it's already stopping/restarting
if (pg.state & (PG_STOPPING|PG_REPEERING))
{
pg_cancel_write_queue(pg, cur_op, op_data->oid, -EPIPE);
return;
}
submit_primary_subops(SUBMIT_WRITE, op_data->target_ver, pg.cur_set.data(), cur_op);
resume_4:
op_data->st = 4;