Compare commits

...

1 Commits

Author SHA1 Message Date
Vitaliy Filippov 4c9bf6727b Experimental: Handle degraded deletions by comparing object versions with epochs
CAUTION! This version is not fool proof yet. If you purge data of an OSD by
overwriting the disk with zeroes and restart it then the same data will also
be removed from other replicas :-).

I plan to add protection from this situation before merging it into master.
The idea is to make each OSD store a random "cookie" on disk and remove itself
from history automatically if the cookie doesn't match.
2023-04-29 00:21:22 +03:00
20 changed files with 553 additions and 105 deletions

View File

@ -10,18 +10,25 @@ function add_pg_history(new_pg_history, new_pg, prev_pgs, prev_pg_history, old_p
if (!new_pg_history[new_pg]) if (!new_pg_history[new_pg])
{ {
new_pg_history[new_pg] = { new_pg_history[new_pg] = {
osd_sets: {}, osd_set_epochs: {},
all_peers: {}, all_peers: {},
epoch: 0, epoch: 0,
}; };
} }
const nh = new_pg_history[new_pg], oh = prev_pg_history[old_pg]; const nh = new_pg_history[new_pg], oh = prev_pg_history[old_pg];
nh.osd_sets[prev_pgs[old_pg].join(' ')] = prev_pgs[old_pg]; nh.osd_set_epochs[prev_pgs[old_pg].join(' ')] = { osd_set: prev_pgs[old_pg] };
if (oh && oh.osd_sets && oh.osd_sets.length) if (oh && oh.osd_sets && oh.osd_sets.length)
{ {
for (const pg of oh.osd_sets) for (const pg of oh.osd_sets)
{ {
nh.osd_sets[pg.join(' ')] = pg.map(osd_num => Number(osd_num)); nh.osd_set_epochs[pg.join(' ')] = { osd_set: pg.map(osd_num => Number(osd_num)) };
}
}
if (oh && oh.osd_set_epochs && oh.osd_set_epochs.length)
{
for (const pg of oh.osd_set_epochs)
{
nh.osd_set_epochs[pg.osd_set.join(' ')] = { osd_set: pg.osd_set.map(osd_num => Number(osd_num)) };
} }
} }
if (oh && oh.all_peers && oh.all_peers.length) if (oh && oh.all_peers && oh.all_peers.length)
@ -39,7 +46,7 @@ function add_pg_history(new_pg_history, new_pg, prev_pgs, prev_pg_history, old_p
function finish_pg_history(merged_history) function finish_pg_history(merged_history)
{ {
merged_history.osd_sets = Object.values(merged_history.osd_sets); merged_history.osd_set_epochs = Object.values(merged_history.osd_set_epochs);
merged_history.all_peers = Object.values(merged_history.all_peers); merged_history.all_peers = Object.values(merged_history.all_peers);
} }

View File

@ -286,7 +286,12 @@ const etcd_tree = {
history: { history: {
/* <pool_id>: { /* <pool_id>: {
<pg_id>: { <pg_id>: {
osd_sets: osd_num_t[][], osd_set_epochs: {
osd_set: osd_num_t[],
min_epoch: uint64_t,
max_epoch: uint64_t,
}[],
osd_sets: osd_num_t[][], // outdated
all_peers: osd_num_t[], all_peers: osd_num_t[],
epoch: uint64_t, epoch: uint64_t,
}, },
@ -968,18 +973,6 @@ class Mon
osd_set, osd_set,
primary: this.pick_primary(pool_id, osd_set, up_osds, aff_osds), primary: this.pick_primary(pool_id, osd_set, up_osds, aff_osds),
}; };
if (prev_pgs[i] && prev_pgs[i].join(' ') != osd_set.join(' ') &&
prev_pgs[i].filter(osd_num => osd_num).length > 0)
{
pg_history[i] = pg_history[i] || {};
pg_history[i].osd_sets = pg_history[i].osd_sets || [];
pg_history[i].osd_sets.push(prev_pgs[i]);
}
if (pg_history[i] && pg_history[i].osd_sets)
{
pg_history[i].osd_sets = Object.values(pg_history[i].osd_sets
.reduce((a, c) => { a[c.join(' ')] = c; return a; }, {}));
}
}); });
for (let i = 0; i < new_pgs.length || i < prev_pgs.length; i++) for (let i = 0; i < new_pgs.length || i < prev_pgs.length; i++)
{ {

View File

@ -88,7 +88,7 @@ struct rm_osd_t
for (auto & hist_item: pg_cfg.target_history) for (auto & hist_item: pg_cfg.target_history)
{ {
int hist_size = 0, hist_rm = 0; int hist_size = 0, hist_rm = 0;
for (auto & old_osd: hist_item) for (auto & old_osd: hist_item.osd_set)
{ {
if (old_osd != 0) if (old_osd != 0)
{ {
@ -382,7 +382,7 @@ struct rm_osd_t
for (int i = 0; i < pg_cfg.target_history.size(); i++) for (int i = 0; i < pg_cfg.target_history.size(); i++)
{ {
int hist_size = 0, hist_rm = 0; int hist_size = 0, hist_rm = 0;
for (auto & old_osd: pg_cfg.target_history[i]) for (auto & old_osd: pg_cfg.target_history[i].osd_set)
{ {
if (old_osd != 0) if (old_osd != 0)
{ {
@ -406,6 +406,15 @@ struct rm_osd_t
} }
if (update_pg_history) if (update_pg_history)
{ {
json11::Json::array target_history;
for (auto & pgh: pg_cfg.target_history)
{
target_history.push_back(json11::Json::object {
{ "osd_set", pgh.osd_set },
{ "min_epoch", pgh.min_epoch },
{ "max_epoch", pgh.max_epoch },
});
}
std::string history_key = base64_encode( std::string history_key = base64_encode(
parent->cli->st_cli.etcd_prefix+"/pg/history/"+ parent->cli->st_cli.etcd_prefix+"/pg/history/"+
std::to_string(pool_cfg.id)+"/"+std::to_string(pg_num) std::to_string(pool_cfg.id)+"/"+std::to_string(pg_num)
@ -416,7 +425,7 @@ struct rm_osd_t
{ "value", base64_encode(json11::Json(json11::Json::object { { "value", base64_encode(json11::Json(json11::Json::object {
{ "epoch", pg_cfg.epoch }, { "epoch", pg_cfg.epoch },
{ "all_peers", pg_cfg.all_peers }, { "all_peers", pg_cfg.all_peers },
{ "osd_sets", pg_cfg.target_history }, { "osd_set_epochs", target_history },
}).dump()) }, }).dump()) },
} }, } },
}); });

View File

@ -96,7 +96,7 @@ inode_list_t* cluster_client_t::list_inode_start(inode_t inode,
} }
for (auto & hist_item: pg.target_history) for (auto & hist_item: pg.target_history)
{ {
for (auto pg_osd: hist_item) for (auto pg_osd: hist_item.osd_set)
{ {
if (pg_osd != 0) if (pg_osd != 0)
{ {
@ -106,11 +106,14 @@ inode_list_t* cluster_client_t::list_inode_start(inode_t inode,
} }
for (osd_num_t peer_osd: all_peers) for (osd_num_t peer_osd: all_peers)
{ {
r->list_osds.push_back((inode_list_osd_t){ if (st_cli.peer_states.find(peer_osd) != st_cli.peer_states.end())
.pg = r, {
.osd_num = peer_osd, r->list_osds.push_back((inode_list_osd_t){
.sent = false, .pg = r,
}); .osd_num = peer_osd,
.sent = false,
});
}
} }
} }
else else

View File

@ -902,9 +902,32 @@ void etcd_state_client_t::parse_state(const etcd_kv_t & kv)
history_set.insert(it, pg_osd_num); history_set.insert(it, pg_osd_num);
} }
} }
auto it = std::lower_bound(pg_cfg.target_history.begin(), pg_cfg.target_history.end(), history_set); pg_history_set_t epoch_set = { .osd_set = history_set };
if (it == pg_cfg.target_history.end() || *it != history_set) auto it = std::lower_bound(pg_cfg.target_history.begin(), pg_cfg.target_history.end(), epoch_set);
pg_cfg.target_history.insert(it, history_set); if (it == pg_cfg.target_history.end() || *it != epoch_set)
pg_cfg.target_history.insert(it, epoch_set);
}
// Newer format with epochs
for (auto hist_item: value["osd_set_epochs"].array_items())
{
pg_history_set_t history_set;
history_set.min_epoch = hist_item["min_epoch"].uint64_value();
history_set.max_epoch = hist_item["max_epoch"].uint64_value();
if (history_set.max_epoch < history_set.min_epoch)
{
history_set.max_epoch = 0;
history_set.min_epoch = 0;
}
for (auto pg_osd: hist_item["osd_set"].array_items())
{
history_set.osd_set.push_back(pg_osd.uint64_value());
}
if (history_set.max_epoch || history_set.osd_set.size())
{
auto it = std::lower_bound(pg_cfg.target_history.begin(), pg_cfg.target_history.end(), history_set);
if (it == pg_cfg.target_history.end() || *it != history_set)
pg_cfg.target_history.insert(it, history_set);
}
} }
// Include these additional OSDs when peering the PG // Include these additional OSDs when peering the PG
for (auto pg_osd: value["all_peers"].array_items()) for (auto pg_osd: value["all_peers"].array_items())

View File

@ -33,7 +33,7 @@ struct pg_config_t
bool exists; bool exists;
osd_num_t primary; osd_num_t primary;
std::vector<osd_num_t> target_set; std::vector<osd_num_t> target_set;
std::vector<std::vector<osd_num_t>> target_history; std::vector<pg_history_set_t> target_history;
std::vector<osd_num_t> all_peers; std::vector<osd_num_t> all_peers;
bool pause; bool pause;
osd_num_t cur_primary; osd_num_t cur_primary;

View File

@ -192,6 +192,7 @@ class osd_t
void reset_stats(); void reset_stats();
json11::Json get_statistics(); json11::Json get_statistics();
void report_statistics(); void report_statistics();
void add_pg_history(pg_t & pg);
void report_pg_state(pg_t & pg); void report_pg_state(pg_t & pg);
void report_pg_states(); void report_pg_states();
void apply_pg_count(); void apply_pg_count();

View File

@ -674,7 +674,7 @@ void osd_t::apply_pg_config()
} }
for (auto & hist_item: pg_cfg.target_history) for (auto & hist_item: pg_cfg.target_history)
{ {
for (auto pg_osd: hist_item) for (auto pg_osd: hist_item.osd_set)
{ {
if (pg_osd != 0) if (pg_osd != 0)
{ {
@ -868,11 +868,40 @@ void osd_t::report_pg_states()
// Prevent race conditions (for the case when the monitor is updating this key at the same time) // Prevent race conditions (for the case when the monitor is updating this key at the same time)
pg.history_changed = false; pg.history_changed = false;
std::string history_key = base64_encode(st_cli.etcd_prefix+"/pg/history/"+std::to_string(pg.pool_id)+"/"+std::to_string(pg.pg_num)); std::string history_key = base64_encode(st_cli.etcd_prefix+"/pg/history/"+std::to_string(pg.pool_id)+"/"+std::to_string(pg.pg_num));
json11::Json::array target_history;
for (auto & pgh: pg.target_history)
{
target_history.push_back(json11::Json::object {
{ "osd_set", pgh.osd_set },
{ "min_epoch", pgh.min_epoch },
{ "max_epoch", pgh.max_epoch },
});
}
std::vector<osd_num_t> all_peers;
for (auto peer_osd: pg.all_peers)
{
bool found = false;
for (auto target_peer: pg.target_set)
{
if (target_peer == peer_osd)
{
found = true;
break;
}
}
if (!found)
{
all_peers.push_back(peer_osd);
}
}
json11::Json::object history_value = { json11::Json::object history_value = {
{ "epoch", pg.epoch }, { "epoch", pg.epoch },
{ "all_peers", pg.all_peers }, { "osd_set_epochs", target_history },
{ "osd_sets", pg.target_history },
}; };
if (all_peers.size())
{
history_value["all_peers"] = all_peers;
}
checks.push_back(json11::Json::object { checks.push_back(json11::Json::object {
{ "target", "MOD" }, { "target", "MOD" },
{ "key", history_key }, { "key", history_key },

View File

@ -287,6 +287,25 @@ bool osd_t::pick_next_recovery(osd_recovery_op_t &op)
void osd_t::submit_recovery_op(osd_recovery_op_t *op) void osd_t::submit_recovery_op(osd_recovery_op_t *op)
{ {
// Check if the object is deleted
bool is_deleted = false;
pool_id_t pool_id = INODE_POOL(op->oid.inode);
auto pool_cfg_it = st_cli.pool_config.find(pool_id);
if (pool_cfg_it != st_cli.pool_config.end())
{
pg_num_t pg_num = (op->oid.stripe/pool_cfg_it->second.pg_stripe_size) % pg_counts[pool_id] + 1; // like map_to_pg()
auto pg_it = pgs.find({ .pool_id = pool_id, .pg_num = pg_num });
if (pg_it != pgs.end())
{
pg_osd_set_state_t *object_state;
get_object_osd_set(pg_it->second, op->oid, pg_it->second.cur_set.data(), &object_state);
if (object_state && (object_state->state & OBJ_DELETED))
{
// Object is deleted, but not from all OSDs - delete remaining copies
is_deleted = true;
}
}
}
op->osd_op = new osd_op_t(); op->osd_op = new osd_op_t();
op->osd_op->op_type = OSD_OP_OUT; op->osd_op->op_type = OSD_OP_OUT;
op->osd_op->req = (osd_any_op_t){ op->osd_op->req = (osd_any_op_t){
@ -294,7 +313,7 @@ void osd_t::submit_recovery_op(osd_recovery_op_t *op)
.header = { .header = {
.magic = SECONDARY_OSD_OP_MAGIC, .magic = SECONDARY_OSD_OP_MAGIC,
.id = 1, .id = 1,
.opcode = OSD_OP_WRITE, .opcode = (uint64_t)(is_deleted ? OSD_OP_DELETE : OSD_OP_WRITE),
}, },
.inode = op->oid.inode, .inode = op->oid.inode,
.offset = op->oid.stripe, .offset = op->oid.stripe,

View File

@ -3,6 +3,8 @@
#pragma once #pragma once
#include <vector>
#define POOL_SCHEME_REPLICATED 1 #define POOL_SCHEME_REPLICATED 1
#define POOL_SCHEME_XOR 2 #define POOL_SCHEME_XOR 2
#define POOL_SCHEME_EC 3 #define POOL_SCHEME_EC 3
@ -38,3 +40,25 @@ inline bool operator != (const pool_pg_num_t & a, const pool_pg_num_t & b)
{ {
return a.pool_id != b.pool_id || a.pg_num != b.pg_num; return a.pool_id != b.pool_id || a.pg_num != b.pg_num;
} }
struct pg_history_set_t
{
std::vector<osd_num_t> osd_set;
uint64_t min_epoch, max_epoch;
};
inline bool operator == (const pg_history_set_t & a, const pg_history_set_t & b)
{
return a.min_epoch == b.min_epoch && a.max_epoch == b.max_epoch && a.osd_set == b.osd_set;
}
inline bool operator != (const pg_history_set_t & a, const pg_history_set_t & b)
{
return a.min_epoch != b.min_epoch || a.max_epoch != b.max_epoch || a.osd_set != b.osd_set;
}
inline bool operator < (const pg_history_set_t & a, const pg_history_set_t & b)
{
return a.min_epoch < b.min_epoch || a.min_epoch == b.min_epoch &&
(a.max_epoch < b.max_epoch || a.max_epoch == b.max_epoch && a.osd_set < b.osd_set);
}

View File

@ -231,7 +231,7 @@ void osd_t::start_pg_peering(pg_t & pg)
for (auto & history_set: pg.target_history) for (auto & history_set: pg.target_history)
{ {
bool found = true; bool found = true;
for (auto history_osd: history_set) for (auto history_osd: history_set.osd_set)
{ {
if (history_osd != 0) if (history_osd != 0)
{ {
@ -471,61 +471,71 @@ void osd_t::finish_stop_pg(pg_t & pg)
report_pg_state(pg); report_pg_state(pg);
} }
static int count_nonzero_osds(const std::vector<osd_num_t> & v)
{
int n = 0;
for (auto & osd_num: v)
{
if (osd_num != 0)
{
n++;
}
}
return n;
}
void osd_t::report_pg_state(pg_t & pg) void osd_t::report_pg_state(pg_t & pg)
{ {
pg.print_state(); pg.print_state();
this->pg_state_dirty.insert({ .pool_id = pg.pool_id, .pg_num = pg.pg_num }); this->pg_state_dirty.insert({ .pool_id = pg.pool_id, .pg_num = pg.pg_num });
if (pg.state == PG_ACTIVE && (pg.target_history.size() > 0 || pg.all_peers.size() > pg.target_set.size())) if ((pg.state == PG_ACTIVE || pg.state == (PG_ACTIVE|PG_LEFT_ON_DEAD)) &&
(pg.target_history.size() != 1 ||
pg.target_history[0].osd_set != pg.target_set ||
pg.target_history[0].min_epoch != 0 ||
pg.target_history[0].max_epoch != pg.epoch ||
pg.all_peers.size() > count_nonzero_osds(pg.target_set)))
{ {
// Clear history of active+clean PGs // Clear history of active+clean PGs
pg.history_changed = true; pg.history_changed = true;
pg.target_history.clear(); pg.target_history.clear();
pg.all_peers = pg.target_set; pg.target_history.push_back((pg_history_set_t){
std::sort(pg.all_peers.begin(), pg.all_peers.end()); .osd_set = pg.cur_set,
pg.cur_peers = pg.target_set; .min_epoch = 0,
// Change pg_config at the same time, otherwise our PG reconciling loop may try to apply the old metadata .max_epoch = pg.epoch,
auto & pg_cfg = st_cli.pool_config[pg.pool_id].pg_config[pg.pg_num]; });
pg_cfg.target_history = pg.target_history; if (pg.state == PG_ACTIVE)
pg_cfg.all_peers = pg.all_peers;
}
else if (pg.state == (PG_ACTIVE|PG_LEFT_ON_DEAD))
{
// Clear history of active+left_on_dead PGs, but leave dead OSDs in all_peers
if (pg.target_history.size())
{ {
pg.history_changed = true; pg.all_peers.clear();
pg.target_history.clear(); for (auto pg_osd: pg.target_set)
}
std::set<osd_num_t> dead_peers;
for (auto pg_osd: pg.all_peers)
{
dead_peers.insert(pg_osd);
}
for (auto pg_osd: pg.cur_peers)
{
dead_peers.erase(pg_osd);
}
for (auto pg_osd: pg.target_set)
{
if (pg_osd)
{ {
dead_peers.insert(pg_osd); if (pg_osd)
pg.all_peers.push_back(pg_osd);
} }
} }
auto new_all_peers = std::vector<osd_num_t>(dead_peers.begin(), dead_peers.end()); else
if (pg.all_peers != new_all_peers)
{ {
pg.history_changed = true; // Clear history of active+left_on_dead PGs, but leave dead OSDs in all_peers
pg.all_peers = new_all_peers; std::set<osd_num_t> dead_peers(pg.all_peers.begin(), pg.all_peers.end());
for (auto pg_osd: pg.cur_peers)
{
dead_peers.erase(pg_osd);
}
for (auto pg_osd: pg.target_set)
{
if (pg_osd)
dead_peers.insert(pg_osd);
}
pg.all_peers.clear();
pg.all_peers.insert(pg.all_peers.begin(), dead_peers.begin(), dead_peers.end());
} }
std::sort(pg.all_peers.begin(), pg.all_peers.end());
pg.cur_peers.clear(); pg.cur_peers.clear();
for (auto pg_osd: pg.target_set) for (auto pg_osd: pg.target_set)
{ {
if (pg_osd) if (pg_osd)
{
pg.cur_peers.push_back(pg_osd); pg.cur_peers.push_back(pg_osd);
}
} }
// Change pg_config at the same time, otherwise our PG reconciling loop may try to apply the old metadata
auto & pg_cfg = st_cli.pool_config[pg.pool_id].pg_config[pg.pg_num]; auto & pg_cfg = st_cli.pool_config[pg.pool_id].pg_config[pg.pg_num];
pg_cfg.target_history = pg.target_history; pg_cfg.target_history = pg.target_history;
pg_cfg.all_peers = pg.all_peers; pg_cfg.all_peers = pg.all_peers;
@ -536,3 +546,51 @@ void osd_t::report_pg_state(pg_t & pg)
} }
report_pg_states(); report_pg_states();
} }
void osd_t::add_pg_history(pg_t & pg)
{
bool epoch_already_reported = false;
int max_epoch_pos = -1;
for (int i = pg.target_history.size()-1; i >= 0; i--)
{
if (pg.target_history[i].min_epoch > pg.epoch)
{
printf("[PG %u/%u] Invalid PG history: there is an entry with min_epoch (%lu) > current epoch (%lu)\n",
pg.pool_id, pg.pg_num, pg.target_history[i].min_epoch, pg.epoch);
force_stop(1);
return;
}
if (max_epoch_pos < 0 || pg.target_history[i].max_epoch > pg.target_history[max_epoch_pos].max_epoch)
{
max_epoch_pos = i;
}
if (pg.target_history[i].min_epoch <= pg.epoch &&
pg.target_history[i].max_epoch >= pg.epoch)
{
if (pg.target_history[i].osd_set != pg.cur_set)
{
printf("[PG %u/%u] Invalid target_history: epoch %lu has another OSD set already registered\n", pg.pool_id, pg.pg_num, pg.epoch);
force_stop(1);
return;
}
// Already reported
epoch_already_reported = true;
break;
}
}
if (!epoch_already_reported)
{
if (max_epoch_pos >= 0 && pg.target_history[max_epoch_pos].osd_set == pg.cur_set)
{
pg.target_history[max_epoch_pos].max_epoch = pg.epoch;
}
else
{
pg.target_history.push_back((pg_history_set_t){
.osd_set = pg.cur_set,
.min_epoch = pg.epoch,
.max_epoch = pg.epoch,
});
}
}
}

View File

@ -52,6 +52,7 @@ struct pg_obj_state_check_t
void walk(); void walk();
void start_object(); void start_object();
void recheck_version_osd_set();
void handle_version(); void handle_version();
void finish_object(); void finish_object();
}; };
@ -84,6 +85,7 @@ void pg_obj_state_check_t::walk()
pg->state = PG_INCOMPLETE | PG_HAS_INVALID; pg->state = PG_INCOMPLETE | PG_HAS_INVALID;
return; return;
} }
// Activate PG
if (pg->pg_cursize < pg->pg_size) if (pg->pg_cursize < pg->pg_size)
{ {
// Activate as degraded // Activate as degraded
@ -108,13 +110,85 @@ void pg_obj_state_check_t::start_object()
n_unstable = n_invalid = 0; n_unstable = n_invalid = 0;
} }
// FIXME: Put this under a feature flag
// FIXME: Implement OSD 'cookies' to be fool-proof so that if an OSD is wiped and
// recreated it doesn't also wipe all other data
void pg_obj_state_check_t::recheck_version_osd_set()
{
uint64_t epoch = (last_ver >> (64-PG_EPOCH_BITS));
if (!pg->epoch_sizes_differ && n_copies >= pg->pg_size)
{
// Enough copies
return;
}
auto epoch_it = pg->target_by_epoch.lower_bound(epoch);
if (epoch_it == pg->target_by_epoch.end() || epoch_it->second.min_epoch > epoch)
{
// Epoch info not found
return;
}
if (pg->epoch_sizes_differ && n_copies >= epoch_it->second.osd_set.size())
{
// For the (unlikely) case of PG size change - enough copies
return;
}
// Recheck version against the OSD set corresponding to epoch if it's known
if (epoch_it != pg->target_by_epoch.end() && epoch_it->second.min_epoch <= epoch)
{
for (int j = 0; j < epoch_it->second.osd_set.size(); j++)
{
osd_num_t cur_osd = epoch_it->second.osd_set[j];
bool found = false;
for (int i = ver_start; i < ver_end; i++)
{
if (cur_osd == list[i].osd_num)
{
found = true;
break;
}
}
if (!found)
{
// Check if a newer version is present on the same OSD and masks the older one
// It happens for overwritten replicas in the following case:
// Version 1 is present on OSD 1,2,3
// Client tries to write Version 2
// OSD 3 succeeds to write Version 2, others don't. OSD 3 crashes, then starts again
// OSD 1 sees: version 1 on OSD 1,2 and version 2 on OSD 3
// (version 1 on OSD 3 is already masked/removed)
// Version 1 is not present on a full set, but it must not be removed
if (replicated)
{
for (int i = obj_start; i < ver_start; i++)
{
if (cur_osd == list[i].osd_num)
{
found = true;
break;
}
}
}
if (!found)
{
// Object is missing from one of the OSDs of that set.
// This means it's deleted or moved and we can safely drop this version.
target_ver = 0;
break;
}
}
}
}
}
void pg_obj_state_check_t::handle_version() void pg_obj_state_check_t::handle_version()
{ {
if (!target_ver && last_ver != list[list_pos].version && (n_stable > 0 || n_roles >= pg->pg_data_size)) if (!target_ver && last_ver != list[list_pos].version && (n_stable > 0 || n_roles >= pg->pg_data_size))
{ {
// Version is either stable or recoverable // Version is either stable or recoverable
target_ver = last_ver;
ver_end = list_pos; ver_end = list_pos;
target_ver = last_ver;
// Skip versions that are not present on any of OSDs for the corresponding PG epoch
recheck_version_osd_set();
} }
if (!target_ver) if (!target_ver)
{ {
@ -178,6 +252,8 @@ void pg_obj_state_check_t::finish_object()
// Version is either stable or recoverable // Version is either stable or recoverable
target_ver = last_ver; target_ver = last_ver;
ver_end = list_pos; ver_end = list_pos;
// Skip versions that are not present on any of OSDs for the corresponding PG epoch
recheck_version_osd_set();
} }
obj_end = list_pos; obj_end = list_pos;
// Remember the decision // Remember the decision
@ -231,11 +307,23 @@ void pg_obj_state_check_t::finish_object()
} }
} }
} }
if (!target_ver) if (!target_ver && (n_unstable >= obj_end-obj_start))
{ {
return; return;
} }
if (!replicated && n_roles < pg->pg_data_size) if (!target_ver)
{
// Object is present, but should not be :) i.e. it's a deleted object that reappeared
if (log_level > 1)
{
printf("Object is deleted: %lx:%lx version=%lu/%lu\n", oid.inode, oid.stripe, target_ver, max_ver);
}
state = OBJ_DELETED;
pg->state = pg->state | PG_HAS_MISPLACED;
// To record all versions as outdated:
ver_end = obj_start;
}
else if (!replicated && n_roles < pg->pg_data_size)
{ {
if (log_level > 1) if (log_level > 1)
{ {
@ -263,7 +351,7 @@ void pg_obj_state_check_t::finish_object()
pg->state = pg->state | PG_HAS_MISPLACED; pg->state = pg->state | PG_HAS_MISPLACED;
} }
if (log_level > 1 && (state & (OBJ_INCOMPLETE | OBJ_DEGRADED)) || if (log_level > 1 && (state & (OBJ_INCOMPLETE | OBJ_DEGRADED)) ||
log_level > 2 && (state & OBJ_MISPLACED)) log_level > 2 && (state & (OBJ_MISPLACED | OBJ_DELETED)))
{ {
for (int i = obj_start; i < obj_end; i++) for (int i = obj_start; i < obj_end; i++)
{ {
@ -272,9 +360,9 @@ void pg_obj_state_check_t::finish_object()
} }
} }
pg->total_count++; pg->total_count++;
if (state != 0 || ver_end < obj_end) osd_set.clear();
if (target_ver != 0 && (state != 0 || ver_end < obj_end))
{ {
osd_set.clear();
for (int i = ver_start; i < ver_end; i++) for (int i = ver_start; i < ver_end; i++)
{ {
osd_set.push_back((pg_obj_loc_t){ osd_set.push_back((pg_obj_loc_t){
@ -297,7 +385,8 @@ void pg_obj_state_check_t::finish_object()
break; break;
} }
} }
if (j >= osd_set.size() && pg->cur_set[list[i].oid.stripe & STRIPE_MASK] != list[i].osd_num) if (j >= osd_set.size() && ((state & OBJ_DELETED) ||
pg->cur_set[list[i].oid.stripe & STRIPE_MASK] != list[i].osd_num))
{ {
osd_set.push_back((pg_obj_loc_t){ osd_set.push_back((pg_obj_loc_t){
.role = (list[i].oid.stripe & STRIPE_MASK), .role = (list[i].oid.stripe & STRIPE_MASK),
@ -312,7 +401,11 @@ void pg_obj_state_check_t::finish_object()
} }
} }
} }
if (target_ver < max_ver) if (state & OBJ_DELETED)
{
pg->ver_override[oid] = max_ver;
}
else if (target_ver < max_ver)
{ {
pg->ver_override[oid] = target_ver; pg->ver_override[oid] = target_ver;
} }
@ -366,6 +459,7 @@ void pg_obj_state_check_t::finish_object()
} }
else else
{ {
assert(it->second.state == state);
it->second.object_count++; it->second.object_count++;
} }
if (state & OBJ_INCOMPLETE) if (state & OBJ_INCOMPLETE)
@ -386,6 +480,34 @@ void pg_obj_state_check_t::finish_object()
// FIXME: Write at least some tests for this function // FIXME: Write at least some tests for this function
void pg_t::calc_object_states(int log_level) void pg_t::calc_object_states(int log_level)
{ {
// Calculate intersections of target_history with cur_peers
for (auto & history_item: target_history)
{
if (history_item.max_epoch)
{
pg_history_set_t & set_copy = target_by_epoch[history_item.max_epoch];
set_copy.min_epoch = history_item.min_epoch;
set_copy.max_epoch = history_item.max_epoch;
for (int i = 0; i < history_item.osd_set.size(); i++)
{
if (history_item.osd_set[i] != 0)
{
for (int j = 0; j < cur_set.size(); j++)
{
if (cur_set[j] == history_item.osd_set[i])
{
set_copy.osd_set.push_back(history_item.osd_set[i]);
break;
}
}
}
}
if (set_copy.osd_set.size() != pg_size)
{
epoch_sizes_differ = true;
}
}
}
// Copy all object lists into one array // Copy all object lists into one array
pg_obj_state_check_t st; pg_obj_state_check_t st;
st.log_level = log_level; st.log_level = log_level;
@ -422,10 +544,18 @@ void pg_t::calc_object_states(int log_level)
std::sort(st.list.begin(), st.list.end()); std::sort(st.list.begin(), st.list.end());
// Walk over it and check object states // Walk over it and check object states
st.walk(); st.walk();
target_by_epoch.clear(); // needed only in this function
if (this->state != PG_ACTIVE) if (this->state != PG_ACTIVE)
{ {
assert(epoch != (((uint64_t)1 << PG_EPOCH_BITS)-1)); assert(epoch != (((uint64_t)1 << PG_EPOCH_BITS)-1));
epoch++; epoch++;
for (auto & pgh: target_history)
{
if (epoch <= pgh.max_epoch)
{
epoch = pgh.max_epoch+1;
}
}
} }
if (log_level > 0) if (log_level > 0)
{ {

View File

@ -89,7 +89,9 @@ struct pg_t
// epoch number - should increase with each non-clean activation of the PG // epoch number - should increase with each non-clean activation of the PG
uint64_t epoch = 0, reported_epoch = 0; uint64_t epoch = 0, reported_epoch = 0;
// target history and all potential peers // target history and all potential peers
std::vector<std::vector<osd_num_t>> target_history; std::vector<pg_history_set_t> target_history;
std::map<uint64_t, pg_history_set_t> target_by_epoch;
bool epoch_sizes_differ = false;
std::vector<osd_num_t> all_peers; std::vector<osd_num_t> all_peers;
bool history_changed = false; bool history_changed = false;
// peer list from the last peering event // peer list from the last peering event

View File

@ -199,6 +199,21 @@ void osd_t::continue_primary_read(osd_op_t *cur_op)
{ {
// PG may be degraded or have misplaced objects // PG may be degraded or have misplaced objects
op_data->prev_set = get_object_osd_set(pg, op_data->oid, pg.cur_set.data(), &op_data->object_state); op_data->prev_set = get_object_osd_set(pg, op_data->oid, pg.cur_set.data(), &op_data->object_state);
if (op_data->object_state && (op_data->object_state->state & OBJ_DELETED))
{
// Object is deleted, just return zeroes
cur_op->reply.rw.version = 0;
cur_op->reply.rw.bitmap_len = op_data->pg_data_size * clean_entry_bitmap_size;
uint64_t zero_len = cur_op->reply.rw.bitmap_len + cur_op->req.rw.len;
while (zero_len >= 0)
{
uint64_t cur_zero_len = zero_buffer_size > zero_len ? zero_len : zero_buffer_size;
cur_op->iov.push_back(zero_buffer, cur_zero_len);
zero_len -= cur_zero_len;
}
finish_op(cur_op, cur_op->req.rw.len);
return;
}
} }
if (pg.state == PG_ACTIVE || op_data->scheme == POOL_SCHEME_REPLICATED) if (pg.state == PG_ACTIVE || op_data->scheme == POOL_SCHEME_REPLICATED)
{ {
@ -290,7 +305,7 @@ void osd_t::remove_object_from_state(object_id & oid, pg_osd_set_state_t *object
report_pg_state(pg); report_pg_state(pg);
} }
} }
else if (object_state->state & OBJ_MISPLACED) else if (object_state->state & (OBJ_MISPLACED | OBJ_DELETED))
{ {
this->misplaced_objects--; this->misplaced_objects--;
pg.misplaced_objects.erase(oid); pg.misplaced_objects.erase(oid);
@ -329,12 +344,6 @@ void osd_t::continue_primary_del(osd_op_t *cur_op)
else if (op_data->st == 4) goto resume_4; else if (op_data->st == 4) goto resume_4;
else if (op_data->st == 5) goto resume_5; else if (op_data->st == 5) goto resume_5;
assert(op_data->st == 0); assert(op_data->st == 0);
// Delete is forbidden even in active PGs if they're also degraded or have previous dead OSDs
if (pg.state & (PG_DEGRADED | PG_LEFT_ON_DEAD))
{
finish_op(cur_op, -EBUSY);
return;
}
if (!check_write_queue(cur_op, pg)) if (!check_write_queue(cur_op, pg))
{ {
return; return;
@ -342,11 +351,18 @@ void osd_t::continue_primary_del(osd_op_t *cur_op)
resume_1: resume_1:
// Determine which OSDs contain this object and delete it // Determine which OSDs contain this object and delete it
op_data->prev_set = get_object_osd_set(pg, op_data->oid, pg.cur_set.data(), &op_data->object_state); op_data->prev_set = get_object_osd_set(pg, op_data->oid, pg.cur_set.data(), &op_data->object_state);
// Submit 1 read to determine the actual version number if (op_data->object_state && (op_data->object_state->state & OBJ_DELETED))
submit_primary_subops(SUBMIT_RMW_READ, UINT64_MAX, op_data->prev_set, cur_op); {
op_data->fact_ver = pg.ver_override[op_data->oid];
}
else
{
// Submit 1 read to determine the actual version number
submit_primary_subops(SUBMIT_RMW_READ, UINT64_MAX, op_data->prev_set, cur_op);
resume_2: resume_2:
op_data->st = 2; op_data->st = 2;
return; return;
}
resume_3: resume_3:
if (op_data->errors > 0) if (op_data->errors > 0)
{ {

View File

@ -133,6 +133,12 @@ int osd_t::collect_bitmap_requests(osd_op_t *cur_op, pg_t & pg, std::vector<bitm
uint64_t target_version = vo_it != pg.ver_override.end() ? vo_it->second : UINT64_MAX; uint64_t target_version = vo_it != pg.ver_override.end() ? vo_it->second : UINT64_MAX;
pg_osd_set_state_t *object_state; pg_osd_set_state_t *object_state;
uint64_t* cur_set = get_object_osd_set(pg, cur_oid, pg.cur_set.data(), &object_state); uint64_t* cur_set = get_object_osd_set(pg, cur_oid, pg.cur_set.data(), &object_state);
if (object_state && (object_state->state & OBJ_DELETED))
{
// Object is deleted, zero out the bitmap
memset((uint8_t*)op_data->snapshot_bitmaps + chain_num*clean_entry_bitmap_size, 0, clean_entry_bitmap_size);
continue;
}
if (pg.scheme == POOL_SCHEME_REPLICATED) if (pg.scheme == POOL_SCHEME_REPLICATED)
{ {
osd_num_t read_target = 0; osd_num_t read_target = 0;

View File

@ -119,17 +119,19 @@ void osd_t::submit_primary_subops(int submit_type, uint64_t op_version, const ui
if (osd_set[role] != 0 && (wr || !rep && stripes[role].read_end != 0)) if (osd_set[role] != 0 && (wr || !rep && stripes[role].read_end != 0))
n_subops++; n_subops++;
} }
if (!n_subops && (submit_type == SUBMIT_RMW_READ || rep)) if (!n_subops && (submit_type == SUBMIT_RMW_READ || rep) && zero_read >= 0)
n_subops = 1; n_subops = 1;
else else
zero_read = -1; zero_read = -1;
osd_op_t *subops = new osd_op_t[n_subops];
op_data->fact_ver = 0; op_data->fact_ver = 0;
op_data->done = op_data->errors = op_data->errcode = 0; op_data->done = op_data->errors = op_data->errcode = 0;
op_data->n_subops = n_subops; op_data->n_subops = n_subops;
op_data->subops = subops; if (n_subops > 0)
int sent = submit_primary_subop_batch(submit_type, op_data->oid.inode, op_version, op_data->stripes, osd_set, cur_op, 0, zero_read); {
assert(sent == n_subops); op_data->subops = new osd_op_t[n_subops];
int sent = submit_primary_subop_batch(submit_type, op_data->oid.inode, op_version, op_data->stripes, osd_set, cur_op, 0, zero_read);
assert(sent == n_subops);
}
} }
int osd_t::submit_primary_subop_batch(int submit_type, inode_t inode, uint64_t op_version, int osd_t::submit_primary_subop_batch(int submit_type, inode_t inode, uint64_t op_version,

View File

@ -156,21 +156,13 @@ resume_3:
{ {
// Report newer epoch before writing // Report newer epoch before writing
// FIXME: We don't have to report all changed PG states here // FIXME: We don't have to report all changed PG states here
this->pg_state_dirty.insert({ .pool_id = pg.pool_id, .pg_num = pg.pg_num });
if (pg.state != PG_ACTIVE) if (pg.state != PG_ACTIVE)
{ {
// Check that current OSD set is in history and/or add it there // Check that current OSD set is in history and/or add it there
std::vector<osd_num_t> history_set; add_pg_history(pg);
for (auto peer_osd: pg.cur_set)
if (peer_osd != 0)
history_set.push_back(peer_osd);
std::sort(history_set.begin(), history_set.end());
auto it = std::lower_bound(pg.target_history.begin(), pg.target_history.end(), history_set);
if (it == pg.target_history.end() || *it != history_set)
pg.target_history.insert(it, history_set);
} }
pg.history_changed = true; pg.history_changed = true;
report_pg_states(); report_pg_state(pg);
resume_10: resume_10:
if (pg.epoch > pg.reported_epoch) if (pg.epoch > pg.reported_epoch)
{ {

View File

@ -32,6 +32,7 @@
#define OBJ_DEGRADED 0x02 #define OBJ_DEGRADED 0x02
#define OBJ_INCOMPLETE 0x04 #define OBJ_INCOMPLETE 0x04
#define OBJ_MISPLACED 0x08 #define OBJ_MISPLACED 0x08
#define OBJ_DELETED 0x10
#define OBJ_NEEDS_STABLE 0x10000 #define OBJ_NEEDS_STABLE 0x10000
#define OBJ_NEEDS_ROLLBACK 0x20000 #define OBJ_NEEDS_ROLLBACK 0x20000

View File

@ -14,6 +14,8 @@ SCHEME=ec ./test_change_pg_count.sh
./test_create_nomaxid.sh ./test_create_nomaxid.sh
./test_degraded_delete.sh
./test_etcd_fail.sh ./test_etcd_fail.sh
./test_failure_domain.sh ./test_failure_domain.sh

131
tests/test_degraded_delete.sh Executable file
View File

@ -0,0 +1,131 @@
#!/bin/bash -ex
# Run 3 OSDs
. `dirname $0`/run_3osds.sh
# Write inodes 1 and 2
LD_PRELOAD="build/src/libfio_vitastor.so" \
fio -thread -name=test -ioengine=build/src/libfio_vitastor.so -bs=1M -direct=1 -iodepth=4 \
-rw=write -etcd=$ETCD_URL -pool=1 -inode=1 -size=128M -runtime=10
LD_PRELOAD="build/src/libfio_vitastor.so" \
fio -thread -name=test -ioengine=build/src/libfio_vitastor.so -bs=1M -direct=1 -iodepth=4 \
-rw=write -etcd=$ETCD_URL -pool=1 -inode=2 -size=128M -runtime=10
LD_PRELOAD="build/src/libfio_vitastor.so" \
fio -thread -name=test -ioengine=build/src/libfio_vitastor.so -bs=4k -direct=1 -iodepth=16 \
-rw=randwrite -etcd=$ETCD_URL -pool=1 -inode=1 -size=128M -runtime=10 &>/dev/null &
sleep 5
# Stop OSD 1
kill -INT $OSD1_PID
sleep 2
# Remove inode 2
build/src/vitastor-cli rm-data --etcd_address $ETCD_URL --pool 1 --inode 2
# Run 3 more OSDs and move PG to 4,5,6
for i in $(seq 4 6); do
dd if=/dev/zero of=./testdata/test_osd$i.bin bs=1024 count=1 seek=$((OSD_SIZE*1024-1))
build/src/vitastor-osd --osd_num $i --bind_address 127.0.0.1 $OSD_ARGS --etcd_address $ETCD_URL $(build/src/vitastor-cli simple-offsets --format options ./testdata/test_osd$i.bin 2>/dev/null) &>./testdata/osd$i.log &
eval OSD${i}_PID=$!
done
$ETCDCTL put /vitastor/config/osd/1 '{"reweight":0}'
$ETCDCTL put /vitastor/config/osd/2 '{"reweight":0}'
$ETCDCTL put /vitastor/config/osd/3 '{"reweight":0}'
# Wait for rebalance to finish
wait_finish_rebalance()
{
local sec=$1
local st=$2
local i=0
while [[ $i -lt $sec ]]; do
if $ETCDCTL get --prefix /vitastor/pg/state/ --print-value-only | jq -s -e \
'([ .[] | select(.state == ['$st'] and (.peers | contains([1]) | not) and (.peers | contains([2,3]) | not)) ] | length) == '$PG_COUNT; then
break
fi
sleep 1
i=$((i+1))
if [ $i -eq $sec ]; then
format_error "Rebalance couldn't finish in $sec seconds"
fi
done
}
wait_finish_rebalance 60 '"active","left_on_dead"'
# Stop OSD 2,3
kill -INT $OSD2_PID
kill -INT $OSD3_PID
sleep 2
# Verify that PGs are still active
if ! ($ETCDCTL get /vitastor/pg/state/1/ --prefix --print-value-only | jq -s -e '[ .[] | select(.state == ["active","left_on_dead"]) ] | length == '$PG_COUNT); then
format_error "FAILED: $PG_COUNT PG(s) NOT UP"
fi
# Start OSD 1
build/src/vitastor-osd --osd_num 1 --bind_address 127.0.0.1 $OSD_ARGS --etcd_address $ETCD_URL $(build/src/vitastor-cli simple-offsets --format options ./testdata/test_osd1.bin 2>/dev/null) &>./testdata/osd1.log &
OSD1_PID=$!
# Verify that inode 2 is removed and inode 1 is in place
wait_repeer_1()
{
local sec=$1
local i=0
while [[ $i -lt $sec ]]; do
if grep -q 'Repeer because of OSD 1' testdata/osd4.log testdata/osd5.log testdata/osd6.log; then
break
fi
sleep 1
i=$((i+1))
if [ $i -eq $sec ]; then
format_error "OSD 4/5/6 do not peer with older OSD 1"
fi
done
}
wait_repeer_1 15
wait_finish_rebalance 15 '"active"'
if [ "$SCHEME" = "replicated" ]; then
NOBJ=1024
else
NOBJ=$((1024/(PG_SIZE-1)))
fi
if ! ($ETCDCTL get /vitastor/pg/stats/1/1 --print-value-only | jq -s -e '.[0].object_count == '$NOBJ); then
format_error "FAILED: PG SHOULD CONTAIN EXACTLY 128 MB OF DATA, BUT IT DOESN'T"
fi
qemu-img convert -S 4096 -p \
-f raw "vitastor:etcd_host=127.0.0.1\:$ETCD_PORT/v3:pool=1:inode=1:size=4096" \
-O raw ./testdata/inode1.bin
qemu-img convert -S 4096 -p \
-f raw "vitastor:etcd_host=127.0.0.1\:$ETCD_PORT/v3:pool=1:inode=2:size="$((128*1024*1024)) \
-O raw ./testdata/inode2.bin
if (dd if=/dev/zero bs=4096 count=1 | diff - ./testdata/inode1.bin); then
format_error "FAILED: INODE 1 SEEMS LOST"
fi
if ! (dd if=/dev/zero bs=1M count=128 | diff - ./testdata/inode2.bin); then
format_error "FAILED: INODE 2 SEEMS RESTORED"
fi
format_green OK