diff --git a/mon/mon.js b/mon/mon.js index d45b0d72..fdb40ce0 100644 --- a/mon/mon.js +++ b/mon/mon.js @@ -232,7 +232,7 @@ const etcd_tree = { /* : { : { primary: osd_num_t, - state: ("starting"|"peering"|"incomplete"|"active"|"repeering"|"stopping"|"offline"| + state: ("starting"|"peering"|"peered"|"incomplete"|"active"|"repeering"|"stopping"|"offline"| "degraded"|"has_incomplete"|"has_degraded"|"has_misplaced"|"has_unclean"| "has_invalid"|"left_on_dead")[], } diff --git a/src/osd_cluster.cpp b/src/osd_cluster.cpp index 18f38c6e..13618a0f 100644 --- a/src/osd_cluster.cpp +++ b/src/osd_cluster.cpp @@ -615,7 +615,7 @@ void osd_t::apply_pg_config() } if (currently_taken) { - if (pg_it->second.state & (PG_ACTIVE | PG_INCOMPLETE | PG_PEERING | PG_REPEERING)) + if (pg_it->second.state & (PG_ACTIVE | PG_INCOMPLETE | PG_PEERING | PG_REPEERING | PG_PEERED)) { if (pg_it->second.target_set == pg_cfg.target_set) { @@ -703,13 +703,19 @@ void osd_t::apply_pg_config() this->pg_config_applied = all_applied; } +struct reporting_pg_t +{ + pool_pg_num_t pool_pg_num; + bool history_changed; +}; + void osd_t::report_pg_states() { if (etcd_reporting_pg_state || !this->pg_state_dirty.size() || !st_cli.etcd_addresses.size()) { return; } - std::vector> reporting_pgs; + std::vector reporting_pgs; json11::Json::array checks; json11::Json::array success; json11::Json::array failure; @@ -721,7 +727,7 @@ void osd_t::report_pg_states() continue; } auto & pg = pg_it->second; - reporting_pgs.push_back({ *it, pg.history_changed }); + reporting_pgs.push_back((reporting_pg_t){ *it, pg.history_changed }); std::string state_key_base64 = base64_encode(st_cli.etcd_prefix+"/pg/state/"+std::to_string(pg.pool_id)+"/"+std::to_string(pg.pg_num)); bool pg_state_exists = false; if (pg.state != PG_STARTING) @@ -827,10 +833,10 @@ void osd_t::report_pg_states() // One of PG state updates failed, put dirty flags back for (auto pp: reporting_pgs) { - this->pg_state_dirty.insert(pp.first); - if (pp.second) + this->pg_state_dirty.insert(pp.pool_pg_num); + if (pp.history_changed) { - auto pg_it = this->pgs.find(pp.first); + auto pg_it = this->pgs.find(pp.pool_pg_num); if (pg_it != this->pgs.end()) { pg_it->second.history_changed = true; @@ -870,17 +876,27 @@ void osd_t::report_pg_states() // Success. We'll get our changes back via the watcher and react to them for (auto pp: reporting_pgs) { - auto pg_it = this->pgs.find(pp.first); + auto pg_it = this->pgs.find(pp.pool_pg_num); if (pg_it != this->pgs.end() && - pg_it->second.state == PG_OFFLINE && - pg_state_dirty.find(pp.first) == pg_state_dirty.end()) + pg_state_dirty.find(pp.pool_pg_num) == pg_state_dirty.end()) { - // Forget offline PGs after reporting their state - if (pg_it->second.scheme == POOL_SCHEME_JERASURE) + if (pg_it->second.state == PG_OFFLINE) { - use_jerasure(pg_it->second.pg_size, pg_it->second.pg_data_size, false); + // Forget offline PGs after reporting their state + // (if the state wasn't changed again) + if (pg_it->second.scheme == POOL_SCHEME_JERASURE) + { + use_jerasure(pg_it->second.pg_size, pg_it->second.pg_data_size, false); + } + this->pgs.erase(pg_it); + } + else if (pg_it->second.state & PG_PEERED) + { + // Activate PG after PG PEERED state is reported along with history + // (if the state wasn't changed again) + pg_it->second.state = pg_it->second.state & ~PG_PEERED | PG_ACTIVE; + report_pg_state(pg_it->second); } - this->pgs.erase(pg_it); } } // Push other PG state updates, if any diff --git a/src/osd_peering.cpp b/src/osd_peering.cpp index 1d2c14ae..3a18487d 100644 --- a/src/osd_peering.cpp +++ b/src/osd_peering.cpp @@ -37,6 +37,10 @@ void osd_t::handle_peers() still = true; } } + else if (p.second.state & PG_PEERED) + { + still = true; + } } if (!still) { @@ -57,6 +61,10 @@ void osd_t::handle_peers() } still = true; } + else if (p.second.state & PG_PEERED) + { + still = true; + } } if (!still) { @@ -79,7 +87,7 @@ void osd_t::repeer_pgs(osd_num_t peer_osd) { auto & pg = p.second; bool repeer = false; - if (pg.state & (PG_PEERING | PG_ACTIVE | PG_INCOMPLETE)) + if (pg.state & (PG_PEERING | PG_PEERED | PG_ACTIVE | PG_INCOMPLETE)) { for (osd_num_t pg_osd: pg.all_peers) { diff --git a/src/osd_peering_pg.cpp b/src/osd_peering_pg.cpp index 9f631d93..c84d883c 100644 --- a/src/osd_peering_pg.cpp +++ b/src/osd_peering_pg.cpp @@ -86,9 +86,24 @@ void pg_obj_state_check_t::walk() } if (pg->pg_cursize < pg->pg_size) { - pg->state |= PG_DEGRADED; + // Report PG history and activate + pg->state |= PG_DEGRADED | PG_PEERED; + std::vector history_set; + for (auto peer_osd: pg->cur_set) + { + if (peer_osd != 0) + { + history_set.push_back(peer_osd); + } + } + pg->target_history.push_back(history_set); + pg->history_changed = true; + } + else + { + // Just activate + pg->state |= PG_ACTIVE; } - pg->state |= PG_ACTIVE; if (pg->state == PG_ACTIVE && pg->cur_peers.size() < pg->all_peers.size()) { pg->state |= PG_LEFT_ON_DEAD; @@ -430,10 +445,11 @@ void pg_t::calc_object_states(int log_level) void pg_t::print_state() { printf( - "[PG %u/%u] is %s%s%s%s%s%s%s%s%s%s%s%s%s%s (%lu objects)\n", pool_id, pg_num, + "[PG %u/%u] is %s%s%s%s%s%s%s%s%s%s%s%s%s%s%s (%lu objects)\n", pool_id, pg_num, (state & PG_STARTING) ? "starting" : "", (state & PG_OFFLINE) ? "offline" : "", (state & PG_PEERING) ? "peering" : "", + (state & PG_PEERED) ? "peered" : "", (state & PG_INCOMPLETE) ? "incomplete" : "", (state & PG_ACTIVE) ? "active" : "", (state & PG_REPEERING) ? "repeering" : "", diff --git a/src/pg_states.cpp b/src/pg_states.cpp index cb1da2c1..86f255cd 100644 --- a/src/pg_states.cpp +++ b/src/pg_states.cpp @@ -3,11 +3,12 @@ #include "pg_states.h" -const int pg_state_bit_count = 15; +const int pg_state_bit_count = 16; -const int pg_state_bits[15] = { +const int pg_state_bits[16] = { PG_STARTING, PG_PEERING, + PG_PEERED, PG_INCOMPLETE, PG_ACTIVE, PG_REPEERING, @@ -22,9 +23,10 @@ const int pg_state_bits[15] = { PG_LEFT_ON_DEAD, }; -const char *pg_state_names[15] = { +const char *pg_state_names[16] = { "starting", "peering", + "peered", "incomplete", "active", "repeering", diff --git a/src/pg_states.h b/src/pg_states.h index 554d754a..eec667c4 100644 --- a/src/pg_states.h +++ b/src/pg_states.h @@ -4,23 +4,27 @@ #pragma once // Placement group states -// STARTING -> [acquire lock] -> PEERING -> INCOMPLETE|ACTIVE -> STOPPING -> OFFLINE -> [release lock] +// STARTING -> [acquire lock] -> PEERING -> PEERED +// PEERED -> [report history if required!] -> INCOMPLETE|ACTIVE +// ACTIVE -> REPEERING -> PEERING +// ACTIVE -> STOPPING -> OFFLINE -> [release lock] // Exactly one of these: #define PG_STARTING (1<<0) #define PG_PEERING (1<<1) -#define PG_INCOMPLETE (1<<2) -#define PG_ACTIVE (1<<3) -#define PG_REPEERING (1<<4) -#define PG_STOPPING (1<<5) -#define PG_OFFLINE (1<<6) +#define PG_PEERED (1<<2) +#define PG_INCOMPLETE (1<<3) +#define PG_ACTIVE (1<<4) +#define PG_REPEERING (1<<5) +#define PG_STOPPING (1<<6) +#define PG_OFFLINE (1<<7) // Plus any of these: -#define PG_DEGRADED (1<<7) -#define PG_HAS_INCOMPLETE (1<<8) -#define PG_HAS_DEGRADED (1<<9) -#define PG_HAS_MISPLACED (1<<10) -#define PG_HAS_UNCLEAN (1<<11) -#define PG_HAS_INVALID (1<<12) -#define PG_LEFT_ON_DEAD (1<<13) +#define PG_DEGRADED (1<<8) +#define PG_HAS_INCOMPLETE (1<<9) +#define PG_HAS_DEGRADED (1<<10) +#define PG_HAS_MISPLACED (1<<11) +#define PG_HAS_UNCLEAN (1<<12) +#define PG_HAS_INVALID (1<<13) +#define PG_LEFT_ON_DEAD (1<<14) // Lower bits that represent object role (EC 0/1/2... or always 0 with replication) // 12 bits is a safe default that doesn't depend on pg_stripe_size or pg_block_size diff --git a/tests/test_interrupted_rebalance.sh b/tests/test_interrupted_rebalance.sh index 2dd28f75..512ecbcd 100755 --- a/tests/test_interrupted_rebalance.sh +++ b/tests/test_interrupted_rebalance.sh @@ -18,19 +18,19 @@ dd if=/dev/zero of=./testdata/test_osd5.bin bs=1024 count=1 seek=$((1024*1024-1) dd if=/dev/zero of=./testdata/test_osd6.bin bs=1024 count=1 seek=$((1024*1024-1)) dd if=/dev/zero of=./testdata/test_osd7.bin bs=1024 count=1 seek=$((1024*1024-1)) -build/src/vitastor-osd --osd_num 1 --bind_address 127.0.0.1 $NO_SAME --etcd_address $ETCD_URL $(node mon/simple-offsets.js --format options --device ./testdata/test_osd1.bin 2>/dev/null) 2>&1 >>./testdata/osd1.log & +build/src/vitastor-osd --osd_num 1 --bind_address 127.0.0.1 $NO_SAME --etcd_address $ETCD_URL $(node mon/simple-offsets.js --format options --device ./testdata/test_osd1.bin 2>/dev/null) &>./testdata/osd1.log & OSD1_PID=$! -build/src/vitastor-osd --osd_num 2 --bind_address 127.0.0.1 $NO_SAME --etcd_address $ETCD_URL $(node mon/simple-offsets.js --format options --device ./testdata/test_osd2.bin 2>/dev/null) 2>&1 >>./testdata/osd2.log & +build/src/vitastor-osd --osd_num 2 --bind_address 127.0.0.1 $NO_SAME --etcd_address $ETCD_URL $(node mon/simple-offsets.js --format options --device ./testdata/test_osd2.bin 2>/dev/null) &>./testdata/osd2.log & OSD2_PID=$! -build/src/vitastor-osd --osd_num 3 --bind_address 127.0.0.1 $NO_SAME --etcd_address $ETCD_URL $(node mon/simple-offsets.js --format options --device ./testdata/test_osd3.bin 2>/dev/null) 2>&1 >>./testdata/osd3.log & +build/src/vitastor-osd --osd_num 3 --bind_address 127.0.0.1 $NO_SAME --etcd_address $ETCD_URL $(node mon/simple-offsets.js --format options --device ./testdata/test_osd3.bin 2>/dev/null) &>./testdata/osd3.log & OSD3_PID=$! -build/src/vitastor-osd --osd_num 4 --bind_address 127.0.0.1 $NO_SAME --etcd_address $ETCD_URL $(node mon/simple-offsets.js --format options --device ./testdata/test_osd4.bin 2>/dev/null) 2>&1 >>./testdata/osd4.log & +build/src/vitastor-osd --osd_num 4 --bind_address 127.0.0.1 $NO_SAME --etcd_address $ETCD_URL $(node mon/simple-offsets.js --format options --device ./testdata/test_osd4.bin 2>/dev/null) &>./testdata/osd4.log & OSD4_PID=$! -build/src/vitastor-osd --osd_num 5 --bind_address 127.0.0.1 $NO_SAME --etcd_address $ETCD_URL $(node mon/simple-offsets.js --format options --device ./testdata/test_osd5.bin 2>/dev/null) 2>&1 >>./testdata/osd5.log & +build/src/vitastor-osd --osd_num 5 --bind_address 127.0.0.1 $NO_SAME --etcd_address $ETCD_URL $(node mon/simple-offsets.js --format options --device ./testdata/test_osd5.bin 2>/dev/null) &>./testdata/osd5.log & OSD5_PID=$! -build/src/vitastor-osd --osd_num 6 --bind_address 127.0.0.1 $NO_SAME --etcd_address $ETCD_URL $(node mon/simple-offsets.js --format options --device ./testdata/test_osd6.bin 2>/dev/null) 2>&1 >>./testdata/osd6.log & +build/src/vitastor-osd --osd_num 6 --bind_address 127.0.0.1 $NO_SAME --etcd_address $ETCD_URL $(node mon/simple-offsets.js --format options --device ./testdata/test_osd6.bin 2>/dev/null) &>./testdata/osd6.log & OSD6_PID=$! -build/src/vitastor-osd --osd_num 7 --bind_address 127.0.0.1 $NO_SAME --etcd_address $ETCD_URL $(node mon/simple-offsets.js --format options --device ./testdata/test_osd7.bin 2>/dev/null) 2>&1 >>./testdata/osd7.log & +build/src/vitastor-osd --osd_num 7 --bind_address 127.0.0.1 $NO_SAME --etcd_address $ETCD_URL $(node mon/simple-offsets.js --format options --device ./testdata/test_osd7.bin 2>/dev/null) &>./testdata/osd7.log & OSD7_PID=$! cd mon