From aa436027c8aa1a788a37f12e118fe46865cb70f0 Mon Sep 17 00:00:00 2001 From: Vitaliy Filippov Date: Sat, 13 Nov 2021 02:06:54 +0300 Subject: [PATCH] Report pg/history from OSD on every degraded activation Required to prevent data loss due to activation of an OSD with older data when PG OSD set change doesn't occur. I.e. fixes the simplest case: - Run 2 OSDs with 1 PG - Start writing into the PG - Stop OSD 2 - Stop OSD 1 - Start OSD 2 After this change the PG will refuse to start after the last step. --- mon/mon.js | 2 +- src/osd_cluster.cpp | 42 ++++++++++++++++++++--------- src/osd_peering.cpp | 10 ++++++- src/osd_peering_pg.cpp | 22 ++++++++++++--- src/pg_states.cpp | 8 +++--- src/pg_states.h | 30 ++++++++++++--------- tests/test_interrupted_rebalance.sh | 14 +++++----- 7 files changed, 87 insertions(+), 41 deletions(-) diff --git a/mon/mon.js b/mon/mon.js index d45b0d72..fdb40ce0 100644 --- a/mon/mon.js +++ b/mon/mon.js @@ -232,7 +232,7 @@ const etcd_tree = { /* : { : { primary: osd_num_t, - state: ("starting"|"peering"|"incomplete"|"active"|"repeering"|"stopping"|"offline"| + state: ("starting"|"peering"|"peered"|"incomplete"|"active"|"repeering"|"stopping"|"offline"| "degraded"|"has_incomplete"|"has_degraded"|"has_misplaced"|"has_unclean"| "has_invalid"|"left_on_dead")[], } diff --git a/src/osd_cluster.cpp b/src/osd_cluster.cpp index 18f38c6e..13618a0f 100644 --- a/src/osd_cluster.cpp +++ b/src/osd_cluster.cpp @@ -615,7 +615,7 @@ void osd_t::apply_pg_config() } if (currently_taken) { - if (pg_it->second.state & (PG_ACTIVE | PG_INCOMPLETE | PG_PEERING | PG_REPEERING)) + if (pg_it->second.state & (PG_ACTIVE | PG_INCOMPLETE | PG_PEERING | PG_REPEERING | PG_PEERED)) { if (pg_it->second.target_set == pg_cfg.target_set) { @@ -703,13 +703,19 @@ void osd_t::apply_pg_config() this->pg_config_applied = all_applied; } +struct reporting_pg_t +{ + pool_pg_num_t pool_pg_num; + bool history_changed; +}; + void osd_t::report_pg_states() { if (etcd_reporting_pg_state || !this->pg_state_dirty.size() || !st_cli.etcd_addresses.size()) { return; } - std::vector> reporting_pgs; + std::vector reporting_pgs; json11::Json::array checks; json11::Json::array success; json11::Json::array failure; @@ -721,7 +727,7 @@ void osd_t::report_pg_states() continue; } auto & pg = pg_it->second; - reporting_pgs.push_back({ *it, pg.history_changed }); + reporting_pgs.push_back((reporting_pg_t){ *it, pg.history_changed }); std::string state_key_base64 = base64_encode(st_cli.etcd_prefix+"/pg/state/"+std::to_string(pg.pool_id)+"/"+std::to_string(pg.pg_num)); bool pg_state_exists = false; if (pg.state != PG_STARTING) @@ -827,10 +833,10 @@ void osd_t::report_pg_states() // One of PG state updates failed, put dirty flags back for (auto pp: reporting_pgs) { - this->pg_state_dirty.insert(pp.first); - if (pp.second) + this->pg_state_dirty.insert(pp.pool_pg_num); + if (pp.history_changed) { - auto pg_it = this->pgs.find(pp.first); + auto pg_it = this->pgs.find(pp.pool_pg_num); if (pg_it != this->pgs.end()) { pg_it->second.history_changed = true; @@ -870,17 +876,27 @@ void osd_t::report_pg_states() // Success. We'll get our changes back via the watcher and react to them for (auto pp: reporting_pgs) { - auto pg_it = this->pgs.find(pp.first); + auto pg_it = this->pgs.find(pp.pool_pg_num); if (pg_it != this->pgs.end() && - pg_it->second.state == PG_OFFLINE && - pg_state_dirty.find(pp.first) == pg_state_dirty.end()) + pg_state_dirty.find(pp.pool_pg_num) == pg_state_dirty.end()) { - // Forget offline PGs after reporting their state - if (pg_it->second.scheme == POOL_SCHEME_JERASURE) + if (pg_it->second.state == PG_OFFLINE) { - use_jerasure(pg_it->second.pg_size, pg_it->second.pg_data_size, false); + // Forget offline PGs after reporting their state + // (if the state wasn't changed again) + if (pg_it->second.scheme == POOL_SCHEME_JERASURE) + { + use_jerasure(pg_it->second.pg_size, pg_it->second.pg_data_size, false); + } + this->pgs.erase(pg_it); + } + else if (pg_it->second.state & PG_PEERED) + { + // Activate PG after PG PEERED state is reported along with history + // (if the state wasn't changed again) + pg_it->second.state = pg_it->second.state & ~PG_PEERED | PG_ACTIVE; + report_pg_state(pg_it->second); } - this->pgs.erase(pg_it); } } // Push other PG state updates, if any diff --git a/src/osd_peering.cpp b/src/osd_peering.cpp index 1d2c14ae..3a18487d 100644 --- a/src/osd_peering.cpp +++ b/src/osd_peering.cpp @@ -37,6 +37,10 @@ void osd_t::handle_peers() still = true; } } + else if (p.second.state & PG_PEERED) + { + still = true; + } } if (!still) { @@ -57,6 +61,10 @@ void osd_t::handle_peers() } still = true; } + else if (p.second.state & PG_PEERED) + { + still = true; + } } if (!still) { @@ -79,7 +87,7 @@ void osd_t::repeer_pgs(osd_num_t peer_osd) { auto & pg = p.second; bool repeer = false; - if (pg.state & (PG_PEERING | PG_ACTIVE | PG_INCOMPLETE)) + if (pg.state & (PG_PEERING | PG_PEERED | PG_ACTIVE | PG_INCOMPLETE)) { for (osd_num_t pg_osd: pg.all_peers) { diff --git a/src/osd_peering_pg.cpp b/src/osd_peering_pg.cpp index 9f631d93..c84d883c 100644 --- a/src/osd_peering_pg.cpp +++ b/src/osd_peering_pg.cpp @@ -86,9 +86,24 @@ void pg_obj_state_check_t::walk() } if (pg->pg_cursize < pg->pg_size) { - pg->state |= PG_DEGRADED; + // Report PG history and activate + pg->state |= PG_DEGRADED | PG_PEERED; + std::vector history_set; + for (auto peer_osd: pg->cur_set) + { + if (peer_osd != 0) + { + history_set.push_back(peer_osd); + } + } + pg->target_history.push_back(history_set); + pg->history_changed = true; + } + else + { + // Just activate + pg->state |= PG_ACTIVE; } - pg->state |= PG_ACTIVE; if (pg->state == PG_ACTIVE && pg->cur_peers.size() < pg->all_peers.size()) { pg->state |= PG_LEFT_ON_DEAD; @@ -430,10 +445,11 @@ void pg_t::calc_object_states(int log_level) void pg_t::print_state() { printf( - "[PG %u/%u] is %s%s%s%s%s%s%s%s%s%s%s%s%s%s (%lu objects)\n", pool_id, pg_num, + "[PG %u/%u] is %s%s%s%s%s%s%s%s%s%s%s%s%s%s%s (%lu objects)\n", pool_id, pg_num, (state & PG_STARTING) ? "starting" : "", (state & PG_OFFLINE) ? "offline" : "", (state & PG_PEERING) ? "peering" : "", + (state & PG_PEERED) ? "peered" : "", (state & PG_INCOMPLETE) ? "incomplete" : "", (state & PG_ACTIVE) ? "active" : "", (state & PG_REPEERING) ? "repeering" : "", diff --git a/src/pg_states.cpp b/src/pg_states.cpp index cb1da2c1..86f255cd 100644 --- a/src/pg_states.cpp +++ b/src/pg_states.cpp @@ -3,11 +3,12 @@ #include "pg_states.h" -const int pg_state_bit_count = 15; +const int pg_state_bit_count = 16; -const int pg_state_bits[15] = { +const int pg_state_bits[16] = { PG_STARTING, PG_PEERING, + PG_PEERED, PG_INCOMPLETE, PG_ACTIVE, PG_REPEERING, @@ -22,9 +23,10 @@ const int pg_state_bits[15] = { PG_LEFT_ON_DEAD, }; -const char *pg_state_names[15] = { +const char *pg_state_names[16] = { "starting", "peering", + "peered", "incomplete", "active", "repeering", diff --git a/src/pg_states.h b/src/pg_states.h index 554d754a..eec667c4 100644 --- a/src/pg_states.h +++ b/src/pg_states.h @@ -4,23 +4,27 @@ #pragma once // Placement group states -// STARTING -> [acquire lock] -> PEERING -> INCOMPLETE|ACTIVE -> STOPPING -> OFFLINE -> [release lock] +// STARTING -> [acquire lock] -> PEERING -> PEERED +// PEERED -> [report history if required!] -> INCOMPLETE|ACTIVE +// ACTIVE -> REPEERING -> PEERING +// ACTIVE -> STOPPING -> OFFLINE -> [release lock] // Exactly one of these: #define PG_STARTING (1<<0) #define PG_PEERING (1<<1) -#define PG_INCOMPLETE (1<<2) -#define PG_ACTIVE (1<<3) -#define PG_REPEERING (1<<4) -#define PG_STOPPING (1<<5) -#define PG_OFFLINE (1<<6) +#define PG_PEERED (1<<2) +#define PG_INCOMPLETE (1<<3) +#define PG_ACTIVE (1<<4) +#define PG_REPEERING (1<<5) +#define PG_STOPPING (1<<6) +#define PG_OFFLINE (1<<7) // Plus any of these: -#define PG_DEGRADED (1<<7) -#define PG_HAS_INCOMPLETE (1<<8) -#define PG_HAS_DEGRADED (1<<9) -#define PG_HAS_MISPLACED (1<<10) -#define PG_HAS_UNCLEAN (1<<11) -#define PG_HAS_INVALID (1<<12) -#define PG_LEFT_ON_DEAD (1<<13) +#define PG_DEGRADED (1<<8) +#define PG_HAS_INCOMPLETE (1<<9) +#define PG_HAS_DEGRADED (1<<10) +#define PG_HAS_MISPLACED (1<<11) +#define PG_HAS_UNCLEAN (1<<12) +#define PG_HAS_INVALID (1<<13) +#define PG_LEFT_ON_DEAD (1<<14) // Lower bits that represent object role (EC 0/1/2... or always 0 with replication) // 12 bits is a safe default that doesn't depend on pg_stripe_size or pg_block_size diff --git a/tests/test_interrupted_rebalance.sh b/tests/test_interrupted_rebalance.sh index 2dd28f75..512ecbcd 100755 --- a/tests/test_interrupted_rebalance.sh +++ b/tests/test_interrupted_rebalance.sh @@ -18,19 +18,19 @@ dd if=/dev/zero of=./testdata/test_osd5.bin bs=1024 count=1 seek=$((1024*1024-1) dd if=/dev/zero of=./testdata/test_osd6.bin bs=1024 count=1 seek=$((1024*1024-1)) dd if=/dev/zero of=./testdata/test_osd7.bin bs=1024 count=1 seek=$((1024*1024-1)) -build/src/vitastor-osd --osd_num 1 --bind_address 127.0.0.1 $NO_SAME --etcd_address $ETCD_URL $(node mon/simple-offsets.js --format options --device ./testdata/test_osd1.bin 2>/dev/null) 2>&1 >>./testdata/osd1.log & +build/src/vitastor-osd --osd_num 1 --bind_address 127.0.0.1 $NO_SAME --etcd_address $ETCD_URL $(node mon/simple-offsets.js --format options --device ./testdata/test_osd1.bin 2>/dev/null) &>./testdata/osd1.log & OSD1_PID=$! -build/src/vitastor-osd --osd_num 2 --bind_address 127.0.0.1 $NO_SAME --etcd_address $ETCD_URL $(node mon/simple-offsets.js --format options --device ./testdata/test_osd2.bin 2>/dev/null) 2>&1 >>./testdata/osd2.log & +build/src/vitastor-osd --osd_num 2 --bind_address 127.0.0.1 $NO_SAME --etcd_address $ETCD_URL $(node mon/simple-offsets.js --format options --device ./testdata/test_osd2.bin 2>/dev/null) &>./testdata/osd2.log & OSD2_PID=$! -build/src/vitastor-osd --osd_num 3 --bind_address 127.0.0.1 $NO_SAME --etcd_address $ETCD_URL $(node mon/simple-offsets.js --format options --device ./testdata/test_osd3.bin 2>/dev/null) 2>&1 >>./testdata/osd3.log & +build/src/vitastor-osd --osd_num 3 --bind_address 127.0.0.1 $NO_SAME --etcd_address $ETCD_URL $(node mon/simple-offsets.js --format options --device ./testdata/test_osd3.bin 2>/dev/null) &>./testdata/osd3.log & OSD3_PID=$! -build/src/vitastor-osd --osd_num 4 --bind_address 127.0.0.1 $NO_SAME --etcd_address $ETCD_URL $(node mon/simple-offsets.js --format options --device ./testdata/test_osd4.bin 2>/dev/null) 2>&1 >>./testdata/osd4.log & +build/src/vitastor-osd --osd_num 4 --bind_address 127.0.0.1 $NO_SAME --etcd_address $ETCD_URL $(node mon/simple-offsets.js --format options --device ./testdata/test_osd4.bin 2>/dev/null) &>./testdata/osd4.log & OSD4_PID=$! -build/src/vitastor-osd --osd_num 5 --bind_address 127.0.0.1 $NO_SAME --etcd_address $ETCD_URL $(node mon/simple-offsets.js --format options --device ./testdata/test_osd5.bin 2>/dev/null) 2>&1 >>./testdata/osd5.log & +build/src/vitastor-osd --osd_num 5 --bind_address 127.0.0.1 $NO_SAME --etcd_address $ETCD_URL $(node mon/simple-offsets.js --format options --device ./testdata/test_osd5.bin 2>/dev/null) &>./testdata/osd5.log & OSD5_PID=$! -build/src/vitastor-osd --osd_num 6 --bind_address 127.0.0.1 $NO_SAME --etcd_address $ETCD_URL $(node mon/simple-offsets.js --format options --device ./testdata/test_osd6.bin 2>/dev/null) 2>&1 >>./testdata/osd6.log & +build/src/vitastor-osd --osd_num 6 --bind_address 127.0.0.1 $NO_SAME --etcd_address $ETCD_URL $(node mon/simple-offsets.js --format options --device ./testdata/test_osd6.bin 2>/dev/null) &>./testdata/osd6.log & OSD6_PID=$! -build/src/vitastor-osd --osd_num 7 --bind_address 127.0.0.1 $NO_SAME --etcd_address $ETCD_URL $(node mon/simple-offsets.js --format options --device ./testdata/test_osd7.bin 2>/dev/null) 2>&1 >>./testdata/osd7.log & +build/src/vitastor-osd --osd_num 7 --bind_address 127.0.0.1 $NO_SAME --etcd_address $ETCD_URL $(node mon/simple-offsets.js --format options --device ./testdata/test_osd7.bin 2>/dev/null) &>./testdata/osd7.log & OSD7_PID=$! cd mon