// Copyright (c) Vitaliy Filippov, 2019+ // License: VNPL-1.0 (see README.md for details) #include #include "osd_peering_pg.h" struct obj_ver_role { object_id oid; uint64_t version; uint64_t osd_num; bool is_stable; }; inline bool operator < (const obj_ver_role & a, const obj_ver_role & b) { // ORDER BY inode ASC, stripe & ~STRIPE_MASK ASC, version DESC, role ASC, osd_num ASC return a.oid.inode < b.oid.inode || a.oid.inode == b.oid.inode && ( (a.oid.stripe & ~STRIPE_MASK) < (b.oid.stripe & ~STRIPE_MASK) || (a.oid.stripe & ~STRIPE_MASK) == (b.oid.stripe & ~STRIPE_MASK) && ( a.version > b.version || a.version == b.version && ( a.oid.stripe < b.oid.stripe || a.oid.stripe == b.oid.stripe && a.osd_num < b.osd_num ) ) ); } struct obj_piece_ver_t { uint64_t max_ver = 0; uint64_t stable_ver = 0; uint64_t max_target = 0; }; struct pg_obj_state_check_t { pg_t *pg; bool replicated = false; std::vector list; int list_pos; int obj_start = 0, obj_end = 0, ver_start = 0, ver_end = 0; object_id oid = { 0 }; uint64_t max_ver = 0; uint64_t last_ver = 0; uint64_t target_ver = 0; uint64_t n_copies = 0, has_roles = 0, n_roles = 0, n_stable = 0, n_mismatched = 0; uint64_t n_unstable = 0, n_invalid = 0; pg_osd_set_t osd_set; int log_level; void walk(); void start_object(); void handle_version(); void finish_object(); }; void pg_obj_state_check_t::walk() { pg->clean_count = 0; pg->total_count = 0; pg->state = 0; for (list_pos = 0; list_pos < list.size(); list_pos++) { if (oid.inode != list[list_pos].oid.inode || oid.stripe != (list[list_pos].oid.stripe & ~STRIPE_MASK)) { if (oid.inode != 0) { finish_object(); } start_object(); } handle_version(); } if (oid.inode != 0) { finish_object(); } if (pg->state & PG_HAS_INVALID) { // Stop PGs with "invalid" objects pg->state = PG_INCOMPLETE | PG_HAS_INVALID; return; } if (pg->pg_cursize < pg->pg_size) { pg->state |= PG_DEGRADED; } pg->state |= PG_ACTIVE; if (pg->state == PG_ACTIVE && pg->cur_peers.size() < pg->all_peers.size()) { pg->state |= PG_LEFT_ON_DEAD; } } void pg_obj_state_check_t::start_object() { obj_start = list_pos; oid = { .inode = list[list_pos].oid.inode, .stripe = list[list_pos].oid.stripe & ~STRIPE_MASK }; last_ver = max_ver = list[list_pos].version; target_ver = 0; ver_start = list_pos; has_roles = n_copies = n_roles = n_stable = n_mismatched = 0; n_unstable = n_invalid = 0; } void pg_obj_state_check_t::handle_version() { if (!target_ver && last_ver != list[list_pos].version && (n_stable > 0 || n_roles >= pg->pg_minsize)) { // Version is either stable or recoverable target_ver = last_ver; ver_end = list_pos; } if (!target_ver) { if (last_ver != list[list_pos].version) { ver_start = list_pos; has_roles = n_copies = n_roles = n_stable = n_mismatched = 0; last_ver = list[list_pos].version; } unsigned replica = (list[list_pos].oid.stripe & STRIPE_MASK); n_copies++; if (replicated && replica > 0 || replica >= pg->pg_size) { n_invalid++; } else { if (list[list_pos].is_stable) { n_stable++; } if (replicated) { int i; for (i = 0; i < pg->cur_set.size(); i++) { if (pg->cur_set[i] == list[list_pos].osd_num) { break; } } if (i == pg->cur_set.size()) { n_mismatched++; } } else { if (pg->cur_set[replica] != list[list_pos].osd_num) { n_mismatched++; } if (!(has_roles & (1 << replica))) { has_roles = has_roles | (1 << replica); n_roles++; } } } } if (!list[list_pos].is_stable) { n_unstable++; } } void pg_obj_state_check_t::finish_object() { if (!target_ver && (n_stable > 0 || n_roles >= pg->pg_minsize)) { // Version is either stable or recoverable target_ver = last_ver; ver_end = list_pos; } obj_end = list_pos; // Remember the decision uint64_t state = 0; if (n_invalid > 0) { // It's not allowed to change the replication scheme for a pool other than by recreating it // So we must bring the PG offline state = OBJ_INCOMPLETE; pg->state |= PG_HAS_INVALID; pg->total_count++; return; } if (n_unstable > 0) { pg->state |= PG_HAS_UNCLEAN; std::unordered_map pieces; for (int i = obj_start; i < obj_end; i++) { auto & pcs = pieces[(obj_piece_id_t){ .oid = list[i].oid, .osd_num = list[i].osd_num }]; if (!pcs.max_ver) { pcs.max_ver = list[i].version; } if (list[i].is_stable && !pcs.stable_ver) { pcs.stable_ver = list[i].version; } if (list[i].version <= target_ver && !pcs.max_target) { pcs.max_target = list[i].version; } } for (auto pp: pieces) { auto & pcs = pp.second; if (pcs.stable_ver < pcs.max_ver) { auto & act = pg->flush_actions[pp.first]; // osd_set doesn't include rollback/stable states, so don't include them in the state code either if (pcs.max_ver > target_ver) { act.rollback = true; act.rollback_to = pcs.max_target; } if (pcs.stable_ver < (pcs.max_ver > target_ver ? pcs.max_target : pcs.max_ver)) { act.make_stable = true; act.stable_to = pcs.max_ver > target_ver ? pcs.max_target : pcs.max_ver; } } } } if (!target_ver) { return; } if (!replicated && n_roles < pg->pg_minsize) { if (log_level > 1) { printf("Object is incomplete: %lx:%lx version=%lu/%lu\n", oid.inode, oid.stripe, target_ver, max_ver); } state = OBJ_INCOMPLETE; pg->state = pg->state | PG_HAS_INCOMPLETE; } else if ((replicated ? n_copies : n_roles) < pg->pg_cursize) { if (log_level > 1) { printf("Object is degraded: %lx:%lx version=%lu/%lu\n", oid.inode, oid.stripe, target_ver, max_ver); } state = OBJ_DEGRADED; pg->state = pg->state | PG_HAS_DEGRADED; } else if (n_mismatched > 0) { if (log_level > 2 && (replicated || n_roles >= pg->pg_cursize)) { printf("Object is misplaced: %lx:%lx version=%lu/%lu\n", oid.inode, oid.stripe, target_ver, max_ver); } state |= OBJ_MISPLACED; pg->state = pg->state | PG_HAS_MISPLACED; } if (log_level > 1 && (state & (OBJ_INCOMPLETE | OBJ_DEGRADED)) || log_level > 2 && (state & OBJ_MISPLACED)) { for (int i = obj_start; i < obj_end; i++) { printf("v%lu present on: osd %lu, role %ld%s\n", list[i].version, list[i].osd_num, (list[i].oid.stripe & STRIPE_MASK), list[i].is_stable ? " (stable)" : ""); } } pg->total_count++; if (state != 0 || ver_end < obj_end) { osd_set.clear(); for (int i = ver_start; i < ver_end; i++) { osd_set.push_back((pg_obj_loc_t){ .role = (list[i].oid.stripe & STRIPE_MASK), .osd_num = list[i].osd_num, .outdated = false, }); } } if (ver_end < obj_end) { // Check for outdated versions not present in the current target OSD set for (int i = ver_end; i < obj_end; i++) { int j; for (j = 0; j < osd_set.size(); j++) { if (osd_set[j].osd_num == list[i].osd_num) { break; } } if (j >= osd_set.size() && pg->cur_set[list[i].oid.stripe & STRIPE_MASK] != list[i].osd_num) { osd_set.push_back((pg_obj_loc_t){ .role = (list[i].oid.stripe & STRIPE_MASK), .osd_num = list[i].osd_num, .outdated = true, }); if (!(state & (OBJ_INCOMPLETE | OBJ_DEGRADED))) { state |= OBJ_MISPLACED; pg->state = pg->state | PG_HAS_MISPLACED; } } } } if (target_ver < max_ver) { pg->ver_override[oid] = target_ver; } if (state == 0) { pg->clean_count++; } else { auto it = pg->state_dict.find(osd_set); if (it == pg->state_dict.end()) { std::vector read_target; if (replicated) { for (auto & o: osd_set) { if (!o.outdated) { read_target.push_back(o.osd_num); } } while (read_target.size() < pg->pg_size) { // FIXME: This is because we then use .data() and assume it's at least long read_target.push_back(0); } } else { read_target.resize(pg->pg_size); for (int i = 0; i < pg->pg_size; i++) { read_target[i] = 0; } for (auto & o: osd_set) { if (!o.outdated) { read_target[o.role] = o.osd_num; } } } pg->state_dict[osd_set] = { .read_target = read_target, .osd_set = osd_set, .state = state, .object_count = 1, }; it = pg->state_dict.find(osd_set); } else { it->second.object_count++; } if (state & OBJ_INCOMPLETE) { pg->incomplete_objects[oid] = &it->second; } else if (state & OBJ_DEGRADED) { pg->degraded_objects[oid] = &it->second; } else { pg->misplaced_objects[oid] = &it->second; } } } // FIXME: Write at least some tests for this function void pg_t::calc_object_states(int log_level) { // Copy all object lists into one array pg_obj_state_check_t st; st.log_level = log_level; st.pg = this; st.replicated = (this->scheme == POOL_SCHEME_REPLICATED); auto ps = peering_state; epoch = 0; for (auto it: ps->list_results) { auto nstab = it.second.stable_count; auto n = it.second.total_count; auto osd_num = it.first; uint64_t start = st.list.size(); st.list.resize(start + n); obj_ver_id *ov = it.second.buf; for (uint64_t i = 0; i < n; i++, ov++) { if ((ov->version >> (64-PG_EPOCH_BITS)) > epoch) { epoch = (ov->version >> (64-PG_EPOCH_BITS)); } st.list[start+i] = { .oid = ov->oid, .version = ov->version, .osd_num = osd_num, .is_stable = i < nstab, }; } free(it.second.buf); it.second.buf = NULL; } ps->list_results.clear(); // Sort std::sort(st.list.begin(), st.list.end()); // Walk over it and check object states st.walk(); if (this->state & (PG_DEGRADED|PG_LEFT_ON_DEAD)) { assert(epoch != ((1ul << PG_EPOCH_BITS)-1)); epoch++; } } void pg_t::print_state() { printf( "[PG %u/%u] is %s%s%s%s%s%s%s%s%s%s%s%s%s (%lu objects)\n", pool_id, pg_num, (state & PG_STARTING) ? "starting" : "", (state & PG_OFFLINE) ? "offline" : "", (state & PG_PEERING) ? "peering" : "", (state & PG_INCOMPLETE) ? "incomplete" : "", (state & PG_ACTIVE) ? "active" : "", (state & PG_STOPPING) ? "stopping" : "", (state & PG_DEGRADED) ? " + degraded" : "", (state & PG_HAS_INCOMPLETE) ? " + has_incomplete" : "", (state & PG_HAS_DEGRADED) ? " + has_degraded" : "", (state & PG_HAS_MISPLACED) ? " + has_misplaced" : "", (state & PG_HAS_UNCLEAN) ? " + has_unclean" : "", (state & PG_HAS_INVALID) ? " + has_invalid" : "", (state & PG_LEFT_ON_DEAD) ? " + left_on_dead" : "", total_count ); }