Report pg/history from OSD on every degraded activation

Required to prevent data loss due to activation of an OSD with older data
when PG OSD set change doesn't occur. I.e. fixes the simplest case:
- Run 2 OSDs with 1 PG
- Start writing into the PG
- Stop OSD 2
- Stop OSD 1
- Start OSD 2

After this change the PG will refuse to start after the last step.
nbd-vmsplice
Vitaliy Filippov 2021-11-13 02:06:54 +03:00
parent 577a563b91
commit aa436027c8
7 changed files with 87 additions and 41 deletions

View File

@ -232,7 +232,7 @@ const etcd_tree = {
/* <pool_id>: {
<pg_id>: {
primary: osd_num_t,
state: ("starting"|"peering"|"incomplete"|"active"|"repeering"|"stopping"|"offline"|
state: ("starting"|"peering"|"peered"|"incomplete"|"active"|"repeering"|"stopping"|"offline"|
"degraded"|"has_incomplete"|"has_degraded"|"has_misplaced"|"has_unclean"|
"has_invalid"|"left_on_dead")[],
}

View File

@ -615,7 +615,7 @@ void osd_t::apply_pg_config()
}
if (currently_taken)
{
if (pg_it->second.state & (PG_ACTIVE | PG_INCOMPLETE | PG_PEERING | PG_REPEERING))
if (pg_it->second.state & (PG_ACTIVE | PG_INCOMPLETE | PG_PEERING | PG_REPEERING | PG_PEERED))
{
if (pg_it->second.target_set == pg_cfg.target_set)
{
@ -703,13 +703,19 @@ void osd_t::apply_pg_config()
this->pg_config_applied = all_applied;
}
struct reporting_pg_t
{
pool_pg_num_t pool_pg_num;
bool history_changed;
};
void osd_t::report_pg_states()
{
if (etcd_reporting_pg_state || !this->pg_state_dirty.size() || !st_cli.etcd_addresses.size())
{
return;
}
std::vector<std::pair<pool_pg_num_t,bool>> reporting_pgs;
std::vector<reporting_pg_t> reporting_pgs;
json11::Json::array checks;
json11::Json::array success;
json11::Json::array failure;
@ -721,7 +727,7 @@ void osd_t::report_pg_states()
continue;
}
auto & pg = pg_it->second;
reporting_pgs.push_back({ *it, pg.history_changed });
reporting_pgs.push_back((reporting_pg_t){ *it, pg.history_changed });
std::string state_key_base64 = base64_encode(st_cli.etcd_prefix+"/pg/state/"+std::to_string(pg.pool_id)+"/"+std::to_string(pg.pg_num));
bool pg_state_exists = false;
if (pg.state != PG_STARTING)
@ -827,10 +833,10 @@ void osd_t::report_pg_states()
// One of PG state updates failed, put dirty flags back
for (auto pp: reporting_pgs)
{
this->pg_state_dirty.insert(pp.first);
if (pp.second)
this->pg_state_dirty.insert(pp.pool_pg_num);
if (pp.history_changed)
{
auto pg_it = this->pgs.find(pp.first);
auto pg_it = this->pgs.find(pp.pool_pg_num);
if (pg_it != this->pgs.end())
{
pg_it->second.history_changed = true;
@ -870,17 +876,27 @@ void osd_t::report_pg_states()
// Success. We'll get our changes back via the watcher and react to them
for (auto pp: reporting_pgs)
{
auto pg_it = this->pgs.find(pp.first);
auto pg_it = this->pgs.find(pp.pool_pg_num);
if (pg_it != this->pgs.end() &&
pg_it->second.state == PG_OFFLINE &&
pg_state_dirty.find(pp.first) == pg_state_dirty.end())
pg_state_dirty.find(pp.pool_pg_num) == pg_state_dirty.end())
{
// Forget offline PGs after reporting their state
if (pg_it->second.scheme == POOL_SCHEME_JERASURE)
if (pg_it->second.state == PG_OFFLINE)
{
use_jerasure(pg_it->second.pg_size, pg_it->second.pg_data_size, false);
// Forget offline PGs after reporting their state
// (if the state wasn't changed again)
if (pg_it->second.scheme == POOL_SCHEME_JERASURE)
{
use_jerasure(pg_it->second.pg_size, pg_it->second.pg_data_size, false);
}
this->pgs.erase(pg_it);
}
else if (pg_it->second.state & PG_PEERED)
{
// Activate PG after PG PEERED state is reported along with history
// (if the state wasn't changed again)
pg_it->second.state = pg_it->second.state & ~PG_PEERED | PG_ACTIVE;
report_pg_state(pg_it->second);
}
this->pgs.erase(pg_it);
}
}
// Push other PG state updates, if any

View File

@ -37,6 +37,10 @@ void osd_t::handle_peers()
still = true;
}
}
else if (p.second.state & PG_PEERED)
{
still = true;
}
}
if (!still)
{
@ -57,6 +61,10 @@ void osd_t::handle_peers()
}
still = true;
}
else if (p.second.state & PG_PEERED)
{
still = true;
}
}
if (!still)
{
@ -79,7 +87,7 @@ void osd_t::repeer_pgs(osd_num_t peer_osd)
{
auto & pg = p.second;
bool repeer = false;
if (pg.state & (PG_PEERING | PG_ACTIVE | PG_INCOMPLETE))
if (pg.state & (PG_PEERING | PG_PEERED | PG_ACTIVE | PG_INCOMPLETE))
{
for (osd_num_t pg_osd: pg.all_peers)
{

View File

@ -86,9 +86,24 @@ void pg_obj_state_check_t::walk()
}
if (pg->pg_cursize < pg->pg_size)
{
pg->state |= PG_DEGRADED;
// Report PG history and activate
pg->state |= PG_DEGRADED | PG_PEERED;
std::vector<osd_num_t> history_set;
for (auto peer_osd: pg->cur_set)
{
if (peer_osd != 0)
{
history_set.push_back(peer_osd);
}
}
pg->target_history.push_back(history_set);
pg->history_changed = true;
}
else
{
// Just activate
pg->state |= PG_ACTIVE;
}
pg->state |= PG_ACTIVE;
if (pg->state == PG_ACTIVE && pg->cur_peers.size() < pg->all_peers.size())
{
pg->state |= PG_LEFT_ON_DEAD;
@ -430,10 +445,11 @@ void pg_t::calc_object_states(int log_level)
void pg_t::print_state()
{
printf(
"[PG %u/%u] is %s%s%s%s%s%s%s%s%s%s%s%s%s%s (%lu objects)\n", pool_id, pg_num,
"[PG %u/%u] is %s%s%s%s%s%s%s%s%s%s%s%s%s%s%s (%lu objects)\n", pool_id, pg_num,
(state & PG_STARTING) ? "starting" : "",
(state & PG_OFFLINE) ? "offline" : "",
(state & PG_PEERING) ? "peering" : "",
(state & PG_PEERED) ? "peered" : "",
(state & PG_INCOMPLETE) ? "incomplete" : "",
(state & PG_ACTIVE) ? "active" : "",
(state & PG_REPEERING) ? "repeering" : "",

View File

@ -3,11 +3,12 @@
#include "pg_states.h"
const int pg_state_bit_count = 15;
const int pg_state_bit_count = 16;
const int pg_state_bits[15] = {
const int pg_state_bits[16] = {
PG_STARTING,
PG_PEERING,
PG_PEERED,
PG_INCOMPLETE,
PG_ACTIVE,
PG_REPEERING,
@ -22,9 +23,10 @@ const int pg_state_bits[15] = {
PG_LEFT_ON_DEAD,
};
const char *pg_state_names[15] = {
const char *pg_state_names[16] = {
"starting",
"peering",
"peered",
"incomplete",
"active",
"repeering",

View File

@ -4,23 +4,27 @@
#pragma once
// Placement group states
// STARTING -> [acquire lock] -> PEERING -> INCOMPLETE|ACTIVE -> STOPPING -> OFFLINE -> [release lock]
// STARTING -> [acquire lock] -> PEERING -> PEERED
// PEERED -> [report history if required!] -> INCOMPLETE|ACTIVE
// ACTIVE -> REPEERING -> PEERING
// ACTIVE -> STOPPING -> OFFLINE -> [release lock]
// Exactly one of these:
#define PG_STARTING (1<<0)
#define PG_PEERING (1<<1)
#define PG_INCOMPLETE (1<<2)
#define PG_ACTIVE (1<<3)
#define PG_REPEERING (1<<4)
#define PG_STOPPING (1<<5)
#define PG_OFFLINE (1<<6)
#define PG_PEERED (1<<2)
#define PG_INCOMPLETE (1<<3)
#define PG_ACTIVE (1<<4)
#define PG_REPEERING (1<<5)
#define PG_STOPPING (1<<6)
#define PG_OFFLINE (1<<7)
// Plus any of these:
#define PG_DEGRADED (1<<7)
#define PG_HAS_INCOMPLETE (1<<8)
#define PG_HAS_DEGRADED (1<<9)
#define PG_HAS_MISPLACED (1<<10)
#define PG_HAS_UNCLEAN (1<<11)
#define PG_HAS_INVALID (1<<12)
#define PG_LEFT_ON_DEAD (1<<13)
#define PG_DEGRADED (1<<8)
#define PG_HAS_INCOMPLETE (1<<9)
#define PG_HAS_DEGRADED (1<<10)
#define PG_HAS_MISPLACED (1<<11)
#define PG_HAS_UNCLEAN (1<<12)
#define PG_HAS_INVALID (1<<13)
#define PG_LEFT_ON_DEAD (1<<14)
// Lower bits that represent object role (EC 0/1/2... or always 0 with replication)
// 12 bits is a safe default that doesn't depend on pg_stripe_size or pg_block_size

View File

@ -18,19 +18,19 @@ dd if=/dev/zero of=./testdata/test_osd5.bin bs=1024 count=1 seek=$((1024*1024-1)
dd if=/dev/zero of=./testdata/test_osd6.bin bs=1024 count=1 seek=$((1024*1024-1))
dd if=/dev/zero of=./testdata/test_osd7.bin bs=1024 count=1 seek=$((1024*1024-1))
build/src/vitastor-osd --osd_num 1 --bind_address 127.0.0.1 $NO_SAME --etcd_address $ETCD_URL $(node mon/simple-offsets.js --format options --device ./testdata/test_osd1.bin 2>/dev/null) 2>&1 >>./testdata/osd1.log &
build/src/vitastor-osd --osd_num 1 --bind_address 127.0.0.1 $NO_SAME --etcd_address $ETCD_URL $(node mon/simple-offsets.js --format options --device ./testdata/test_osd1.bin 2>/dev/null) &>./testdata/osd1.log &
OSD1_PID=$!
build/src/vitastor-osd --osd_num 2 --bind_address 127.0.0.1 $NO_SAME --etcd_address $ETCD_URL $(node mon/simple-offsets.js --format options --device ./testdata/test_osd2.bin 2>/dev/null) 2>&1 >>./testdata/osd2.log &
build/src/vitastor-osd --osd_num 2 --bind_address 127.0.0.1 $NO_SAME --etcd_address $ETCD_URL $(node mon/simple-offsets.js --format options --device ./testdata/test_osd2.bin 2>/dev/null) &>./testdata/osd2.log &
OSD2_PID=$!
build/src/vitastor-osd --osd_num 3 --bind_address 127.0.0.1 $NO_SAME --etcd_address $ETCD_URL $(node mon/simple-offsets.js --format options --device ./testdata/test_osd3.bin 2>/dev/null) 2>&1 >>./testdata/osd3.log &
build/src/vitastor-osd --osd_num 3 --bind_address 127.0.0.1 $NO_SAME --etcd_address $ETCD_URL $(node mon/simple-offsets.js --format options --device ./testdata/test_osd3.bin 2>/dev/null) &>./testdata/osd3.log &
OSD3_PID=$!
build/src/vitastor-osd --osd_num 4 --bind_address 127.0.0.1 $NO_SAME --etcd_address $ETCD_URL $(node mon/simple-offsets.js --format options --device ./testdata/test_osd4.bin 2>/dev/null) 2>&1 >>./testdata/osd4.log &
build/src/vitastor-osd --osd_num 4 --bind_address 127.0.0.1 $NO_SAME --etcd_address $ETCD_URL $(node mon/simple-offsets.js --format options --device ./testdata/test_osd4.bin 2>/dev/null) &>./testdata/osd4.log &
OSD4_PID=$!
build/src/vitastor-osd --osd_num 5 --bind_address 127.0.0.1 $NO_SAME --etcd_address $ETCD_URL $(node mon/simple-offsets.js --format options --device ./testdata/test_osd5.bin 2>/dev/null) 2>&1 >>./testdata/osd5.log &
build/src/vitastor-osd --osd_num 5 --bind_address 127.0.0.1 $NO_SAME --etcd_address $ETCD_URL $(node mon/simple-offsets.js --format options --device ./testdata/test_osd5.bin 2>/dev/null) &>./testdata/osd5.log &
OSD5_PID=$!
build/src/vitastor-osd --osd_num 6 --bind_address 127.0.0.1 $NO_SAME --etcd_address $ETCD_URL $(node mon/simple-offsets.js --format options --device ./testdata/test_osd6.bin 2>/dev/null) 2>&1 >>./testdata/osd6.log &
build/src/vitastor-osd --osd_num 6 --bind_address 127.0.0.1 $NO_SAME --etcd_address $ETCD_URL $(node mon/simple-offsets.js --format options --device ./testdata/test_osd6.bin 2>/dev/null) &>./testdata/osd6.log &
OSD6_PID=$!
build/src/vitastor-osd --osd_num 7 --bind_address 127.0.0.1 $NO_SAME --etcd_address $ETCD_URL $(node mon/simple-offsets.js --format options --device ./testdata/test_osd7.bin 2>/dev/null) 2>&1 >>./testdata/osd7.log &
build/src/vitastor-osd --osd_num 7 --bind_address 127.0.0.1 $NO_SAME --etcd_address $ETCD_URL $(node mon/simple-offsets.js --format options --device ./testdata/test_osd7.bin 2>/dev/null) &>./testdata/osd7.log &
OSD7_PID=$!
cd mon