Fix unstable flushing, include extra OSDs with old object versions in osd_set

trace-sqes
Vitaliy Filippov 2020-03-23 15:44:29 +03:00
parent fd8e1a8418
commit 036f4c5bf3
3 changed files with 121 additions and 93 deletions

View File

@ -90,6 +90,10 @@ void osd_t::handle_flush_op(pg_num_t pg_num, pg_flush_batch_t *fb, osd_num_t osd
it->first.oid.inode != prev_it->first.oid.inode || it->first.oid.inode != prev_it->first.oid.inode ||
(it->first.oid.stripe & ~STRIPE_MASK) != (prev_it->first.oid.stripe & ~STRIPE_MASK)) (it->first.oid.stripe & ~STRIPE_MASK) != (prev_it->first.oid.stripe & ~STRIPE_MASK))
{ {
pg.ver_override.erase((object_id){
.inode = prev_it->first.oid.inode,
.stripe = (prev_it->first.oid.stripe & ~STRIPE_MASK),
});
auto wr_it = pg.write_queue.find((object_id){ auto wr_it = pg.write_queue.find((object_id){
.inode = prev_it->first.oid.inode, .inode = prev_it->first.oid.inode,
.stripe = (prev_it->first.oid.stripe & ~STRIPE_MASK), .stripe = (prev_it->first.oid.stripe & ~STRIPE_MASK),

View File

@ -23,20 +23,21 @@ struct obj_piece_ver_t
{ {
uint64_t max_ver = 0; uint64_t max_ver = 0;
uint64_t stable_ver = 0; uint64_t stable_ver = 0;
uint64_t max_target = 0;
}; };
struct pg_obj_state_check_t struct pg_obj_state_check_t
{ {
pg_t *pg; pg_t *pg;
int i;
std::vector<obj_ver_role> list; std::vector<obj_ver_role> list;
int list_pos;
int obj_start = 0, obj_end = 0, ver_start = 0, ver_end = 0; int obj_start = 0, obj_end = 0, ver_start = 0, ver_end = 0;
object_id oid = { 0 }; object_id oid = { 0 };
uint64_t max_ver = 0; uint64_t max_ver = 0;
uint64_t last_ver = 0; uint64_t last_ver = 0;
uint64_t target_ver = 0; uint64_t target_ver = 0;
uint64_t n_copies = 0, has_roles = 0, n_roles = 0, n_stable = 0, n_mismatched = 0; uint64_t n_copies = 0, has_roles = 0, n_roles = 0, n_stable = 0, n_mismatched = 0;
bool is_buggy = false, has_old_unstable = false; uint64_t n_unstable = 0, n_buggy = 0;
pg_osd_set_t osd_set; pg_osd_set_t osd_set;
void walk(); void walk();
@ -50,10 +51,10 @@ void pg_obj_state_check_t::walk()
pg->clean_count = 0; pg->clean_count = 0;
pg->total_count = 0; pg->total_count = 0;
pg->state = 0; pg->state = 0;
for (i = 0; i < list.size(); i++) for (list_pos = 0; list_pos < list.size(); list_pos++)
{ {
if (oid.inode != list[i].oid.inode || if (oid.inode != list[list_pos].oid.inode ||
oid.stripe != (list[i].oid.stripe & ~STRIPE_MASK)) oid.stripe != (list[list_pos].oid.stripe & ~STRIPE_MASK))
{ {
if (oid.inode != 0) if (oid.inode != 0)
{ {
@ -76,45 +77,45 @@ void pg_obj_state_check_t::walk()
void pg_obj_state_check_t::start_object() void pg_obj_state_check_t::start_object()
{ {
obj_start = i; obj_start = list_pos;
oid = { .inode = list[i].oid.inode, .stripe = list[i].oid.stripe & ~STRIPE_MASK }; oid = { .inode = list[list_pos].oid.inode, .stripe = list[list_pos].oid.stripe & ~STRIPE_MASK };
last_ver = max_ver = list[i].version; last_ver = max_ver = list[list_pos].version;
target_ver = 0; target_ver = 0;
ver_start = i; ver_start = list_pos;
has_roles = n_copies = n_roles = n_stable = n_mismatched = 0; has_roles = n_copies = n_roles = n_stable = n_mismatched = 0;
is_buggy = false; n_unstable = n_buggy = 0;
} }
void pg_obj_state_check_t::handle_version() void pg_obj_state_check_t::handle_version()
{ {
if (!target_ver && last_ver != list[i].version && (n_stable > 0 || n_roles >= pg->pg_minsize)) if (!target_ver && last_ver != list[list_pos].version && (n_stable > 0 || n_roles >= pg->pg_minsize))
{ {
// Version is either stable or recoverable // Version is either stable or recoverable
target_ver = last_ver; target_ver = last_ver;
ver_end = i; ver_end = list_pos;
} }
if (!target_ver) if (!target_ver)
{ {
if (last_ver != list[i].version) if (last_ver != list[list_pos].version)
{ {
ver_start = i; ver_start = list_pos;
has_roles = n_copies = n_roles = n_stable = n_mismatched = 0; has_roles = n_copies = n_roles = n_stable = n_mismatched = 0;
last_ver = list[i].version; last_ver = list[list_pos].version;
} }
int replica = (list[i].oid.stripe & STRIPE_MASK); int replica = (list[list_pos].oid.stripe & STRIPE_MASK);
n_copies++; n_copies++;
if (replica >= pg->pg_size) if (replica >= pg->pg_size)
{ {
// FIXME In the future, check it against the PG epoch number to handle replication factor/scheme changes // FIXME In the future, check it against the PG epoch number to handle replication factor/scheme changes
is_buggy = true; n_buggy++;
} }
else else
{ {
if (list[i].is_stable) if (list[list_pos].is_stable)
{ {
n_stable++; n_stable++;
} }
if (pg->cur_set[replica] != list[i].osd_num) if (pg->cur_set[replica] != list[list_pos].osd_num)
{ {
n_mismatched++; n_mismatched++;
} }
@ -125,9 +126,9 @@ void pg_obj_state_check_t::handle_version()
} }
} }
} }
else if (!list[i].is_stable) if (!list[list_pos].is_stable)
{ {
has_old_unstable = true; n_unstable++;
} }
} }
@ -137,11 +138,17 @@ void pg_obj_state_check_t::finish_object()
{ {
// Version is either stable or recoverable // Version is either stable or recoverable
target_ver = last_ver; target_ver = last_ver;
ver_end = i; ver_end = list_pos;
} }
obj_end = i; obj_end = list_pos;
// Remember the decision // Remember the decision
uint64_t state = OBJ_CLEAN; uint64_t state = OBJ_CLEAN;
if (n_buggy > 0)
{
state = OBJ_BUGGY;
// FIXME: bring pg offline
throw std::runtime_error("buggy object state");
}
if (target_ver > 0) if (target_ver > 0)
{ {
if (n_roles < pg->pg_minsize) if (n_roles < pg->pg_minsize)
@ -169,29 +176,51 @@ void pg_obj_state_check_t::finish_object()
state |= OBJ_MISPLACED; state |= OBJ_MISPLACED;
pg->state = pg->state | PG_HAS_MISPLACED; pg->state = pg->state | PG_HAS_MISPLACED;
} }
if (n_stable < n_copies) pg->total_count++;
}
if (n_unstable > 0)
{
pg->state |= PG_HAS_UNCLEAN;
std::unordered_map<obj_piece_id_t, obj_piece_ver_t> pieces;
for (int i = obj_start; i < obj_end; i++)
{ {
state |= OBJ_NEEDS_STABLE; auto & pcs = pieces[(obj_piece_id_t){ .oid = list[i].oid, .osd_num = list[i].osd_num }];
pg->state = pg->state | PG_HAS_UNCLEAN; if (!pcs.max_ver)
{
pcs.max_ver = list[i].version;
}
if (list[i].is_stable && !pcs.stable_ver)
{
pcs.stable_ver = list[i].version;
}
if (list[i].version <= target_ver)
{
pcs.max_target = list[i].version;
}
}
for (auto pp: pieces)
{
auto & pcs = pp.second;
if (pcs.stable_ver < pcs.max_ver)
{
auto & act = pg->flush_actions[pp.first];
// osd_set doesn't include rollback/stable states, so don't include them in the state code either
if (pcs.max_ver > target_ver)
{
//state |= OBJ_NEEDS_ROLLBACK;
act.rollback = true;
act.rollback_to = pcs.max_target;
}
if (pcs.stable_ver < (pcs.max_ver > target_ver ? pcs.max_target : pcs.max_ver))
{
//state |= OBJ_NEEDS_STABLE;
act.make_stable = true;
act.stable_to = pcs.max_ver > target_ver ? pcs.max_target : pcs.max_ver;
}
}
} }
} }
if (target_ver < max_ver || has_old_unstable) if (state != OBJ_CLEAN || ver_end < obj_end)
{
state |= OBJ_NEEDS_ROLLBACK;
pg->state = pg->state | PG_HAS_UNCLEAN;
}
if (is_buggy)
{
state |= OBJ_BUGGY;
// FIXME: bring pg offline
throw std::runtime_error("buggy object state");
}
pg->total_count++;
if (state == OBJ_CLEAN)
{
pg->clean_count++;
}
else
{ {
osd_set.clear(); osd_set.clear();
for (int i = ver_start; i < ver_end; i++) for (int i = ver_start; i < ver_end; i++)
@ -199,10 +228,45 @@ void pg_obj_state_check_t::finish_object()
osd_set.push_back((pg_obj_loc_t){ osd_set.push_back((pg_obj_loc_t){
.role = (list[i].oid.stripe & STRIPE_MASK), .role = (list[i].oid.stripe & STRIPE_MASK),
.osd_num = list[i].osd_num, .osd_num = list[i].osd_num,
.stable = list[i].is_stable, .outdated = false,
}); });
} }
std::sort(osd_set.begin(), osd_set.end()); }
if (ver_end < obj_end)
{
// Check for outdated versions not present in the current target OSD set
for (int i = ver_end; i < obj_end; i++)
{
int j;
for (j = 0; j < osd_set.size(); j++)
{
if (osd_set[j].osd_num == list[i].osd_num)
{
break;
}
}
if (j >= osd_set.size() && pg->cur_set[list[i].oid.stripe & STRIPE_MASK] != list[i].osd_num)
{
osd_set.push_back((pg_obj_loc_t){
.role = (list[i].oid.stripe & STRIPE_MASK),
.osd_num = list[i].osd_num,
.outdated = true,
});
state |= OBJ_MISPLACED;
pg->state = pg->state | PG_HAS_MISPLACED;
}
}
}
if (target_ver < max_ver)
{
pg->ver_override[oid] = target_ver;
}
if (state == OBJ_CLEAN)
{
pg->clean_count++;
}
else
{
auto it = pg->state_dict.find(osd_set); auto it = pg->state_dict.find(osd_set);
if (it == pg->state_dict.end()) if (it == pg->state_dict.end())
{ {
@ -214,7 +278,10 @@ void pg_obj_state_check_t::finish_object()
} }
for (auto & o: osd_set) for (auto & o: osd_set)
{ {
read_target[o.role] = o.osd_num; if (!o.outdated)
{
read_target[o.role] = o.osd_num;
}
} }
pg->state_dict[osd_set] = { pg->state_dict[osd_set] = {
.read_target = read_target, .read_target = read_target,
@ -229,49 +296,6 @@ void pg_obj_state_check_t::finish_object()
it->second.object_count++; it->second.object_count++;
} }
pg->obj_states[oid] = &it->second; pg->obj_states[oid] = &it->second;
if (target_ver < max_ver)
{
pg->ver_override[oid] = target_ver;
}
if (state & (OBJ_NEEDS_ROLLBACK | OBJ_NEEDS_STABLE))
{
std::unordered_map<obj_piece_id_t, obj_piece_ver_t> pieces;
for (int i = obj_start; i < obj_end; i++)
{
auto & pcs = pieces[(obj_piece_id_t){ .oid = list[i].oid, .osd_num = list[i].osd_num }];
if (!pcs.max_ver)
{
pcs.max_ver = list[i].version;
}
if (list[i].is_stable && !pcs.stable_ver)
{
pcs.stable_ver = list[i].version;
}
}
for (auto pp: pieces)
{
auto & pcs = pp.second;
if (pcs.stable_ver < pcs.max_ver)
{
auto & act = pg->flush_actions[pp.first];
if (pcs.max_ver > target_ver)
{
act.rollback = true;
act.rollback_to = target_ver;
}
else if (pcs.max_ver < target_ver && pcs.stable_ver < pcs.max_ver)
{
act.rollback = true;
act.rollback_to = pcs.stable_ver;
}
if (pcs.max_ver >= target_ver && pcs.stable_ver < target_ver)
{
act.make_stable = true;
act.stable_to = target_ver;
}
}
}
}
} }
} }

View File

@ -37,7 +37,7 @@ struct pg_obj_loc_t
{ {
uint64_t role; uint64_t role;
osd_num_t osd_num; osd_num_t osd_num;
bool stable; bool outdated;
}; };
typedef std::vector<pg_obj_loc_t> pg_osd_set_t; typedef std::vector<pg_obj_loc_t> pg_osd_set_t;
@ -121,8 +121,9 @@ struct pg_t
inline bool operator < (const pg_obj_loc_t &a, const pg_obj_loc_t &b) inline bool operator < (const pg_obj_loc_t &a, const pg_obj_loc_t &b)
{ {
return a.role < b.role || a.role == b.role && a.osd_num < b.osd_num || return a.role < b.role ||
a.role == b.role && a.osd_num == b.osd_num && a.stable < b.stable; a.role == b.role && a.outdated < b.outdated ||
a.role == b.role && a.outdated == b.outdated && a.osd_num < b.osd_num;
} }
inline bool operator == (const obj_piece_id_t & a, const obj_piece_id_t & b) inline bool operator == (const obj_piece_id_t & a, const obj_piece_id_t & b)
@ -147,7 +148,6 @@ namespace std
// Copy-pasted from spp::hash_combine() // Copy-pasted from spp::hash_combine()
seed ^= (e.role + 0xc6a4a7935bd1e995 + (seed << 6) + (seed >> 2)); seed ^= (e.role + 0xc6a4a7935bd1e995 + (seed << 6) + (seed >> 2));
seed ^= (e.osd_num + 0xc6a4a7935bd1e995 + (seed << 6) + (seed >> 2)); seed ^= (e.osd_num + 0xc6a4a7935bd1e995 + (seed << 6) + (seed >> 2));
seed ^= ((e.stable ? 1 : 0) + 0xc6a4a7935bd1e995 + (seed << 6) + (seed >> 2));
} }
return seed; return seed;
} }