Implement PG state locking and PG moving in response to etcd events

trace-sqes
Vitaliy Filippov 2020-04-27 14:32:59 +03:00
parent ec4a52af48
commit 7b57eeeeb3
13 changed files with 784 additions and 289 deletions

View File

@ -67,6 +67,8 @@ rw_blocking.o: rw_blocking.cpp rw_blocking.h
g++ $(CXXFLAGS) -c -o $@ $< g++ $(CXXFLAGS) -c -o $@ $<
osd_test: osd_test.cpp osd_ops.h rw_blocking.o osd_test: osd_test.cpp osd_ops.h rw_blocking.o
g++ $(CXXFLAGS) -o osd_test osd_test.cpp rw_blocking.o -ltcmalloc_minimal g++ $(CXXFLAGS) -o osd_test osd_test.cpp rw_blocking.o -ltcmalloc_minimal
osd_peering_pg_test: osd_peering_pg_test.cpp osd_peering_pg.o
g++ $(CXXFLAGS) -o $@ $< osd_peering_pg.o -ltcmalloc_minimal
libfio_sec_osd.so: fio_sec_osd.cpp osd_ops.h rw_blocking.o libfio_sec_osd.so: fio_sec_osd.cpp osd_ops.h rw_blocking.o
g++ $(CXXFLAGS) -ltcmalloc_minimal -shared -o libfio_sec_osd.so fio_sec_osd.cpp rw_blocking.o -luring g++ $(CXXFLAGS) -ltcmalloc_minimal -shared -o libfio_sec_osd.so fio_sec_osd.cpp rw_blocking.o -luring

18
osd.cpp
View File

@ -106,10 +106,8 @@ void osd_t::parse_config(blockstore_config_t & config)
int major, minor; int major, minor;
if (sscanf(config["etcd_version"].c_str(), "%d.%d", &major, &minor) < 2) if (sscanf(config["etcd_version"].c_str(), "%d.%d", &major, &minor) < 2)
throw std::runtime_error("etcd_version should be in the form MAJOR.MINOR (for example, 3.2)"); throw std::runtime_error("etcd_version should be in the form MAJOR.MINOR (for example, 3.2)");
if (major < 3 || major == 3 && minor < 2) if (major < 3 || major == 3 && minor < 3)
throw std::runtime_error("Your etcd is too old, minimum required version is 3.2"); throw std::runtime_error("Your etcd is too old, minimum required version is 3.3");
else if (major == 3 && minor == 2)
etcd_api_path = "/v3alpha";
else if (major == 3 && minor == 3) else if (major == 3 && minor == 3)
etcd_api_path = "/v3beta"; etcd_api_path = "/v3beta";
else else
@ -117,10 +115,6 @@ void osd_t::parse_config(blockstore_config_t & config)
} }
else else
etcd_api_path = "/v3"; etcd_api_path = "/v3";
if ((pos = etcd_address.find(':')) >= 0)
etcd_host = etcd_address.substr(0, pos);
else
etcd_host = etcd_address;
etcd_report_interval = strtoull(config["etcd_report_interval"].c_str(), NULL, 10); etcd_report_interval = strtoull(config["etcd_report_interval"].c_str(), NULL, 10);
if (etcd_report_interval <= 0) if (etcd_report_interval <= 0)
etcd_report_interval = 30; etcd_report_interval = 30;
@ -153,12 +147,10 @@ void osd_t::parse_config(blockstore_config_t & config)
peer_connect_interval = strtoull(config["peer_connect_interval"].c_str(), NULL, 10); peer_connect_interval = strtoull(config["peer_connect_interval"].c_str(), NULL, 10);
if (!peer_connect_interval) if (!peer_connect_interval)
peer_connect_interval = 5; peer_connect_interval = 5;
http_request_timeout = strtoull(config["http_request_timeout"].c_str(), NULL, 10);
if (!http_request_timeout)
http_request_timeout = 5;
peer_connect_timeout = strtoull(config["peer_connect_timeout"].c_str(), NULL, 10); peer_connect_timeout = strtoull(config["peer_connect_timeout"].c_str(), NULL, 10);
if (!peer_connect_timeout) if (!peer_connect_timeout)
peer_connect_timeout = 5; peer_connect_timeout = 5;
log_level = strtoull(config["log_level"].c_str(), NULL, 10);
} }
void osd_t::bind_socket() void osd_t::bind_socket()
@ -394,13 +386,13 @@ void osd_t::stop_client(int peer_fd)
if (cl.osd_num) if (cl.osd_num)
{ {
// Reload configuration from etcd when the connection is dropped // Reload configuration from etcd when the connection is dropped
printf("[%lu] Stopping client %d (OSD peer %lu)\n", osd_num, peer_fd, cl.osd_num); printf("[OSD %lu] Stopping client %d (OSD peer %lu)\n", osd_num, peer_fd, cl.osd_num);
peer_states.erase(cl.osd_num); peer_states.erase(cl.osd_num);
repeer_pgs(cl.osd_num); repeer_pgs(cl.osd_num);
} }
else else
{ {
printf("[%lu] Stopping client %d (regular client)\n", osd_num, peer_fd); printf("[OSD %lu] Stopping client %d (regular client)\n", osd_num, peer_fd);
} }
} }
clients.erase(it); clients.erase(it);

70
osd.h
View File

@ -18,6 +18,7 @@
#include "timerfd_manager.h" #include "timerfd_manager.h"
#include "osd_ops.h" #include "osd_ops.h"
#include "osd_peering_pg.h" #include "osd_peering_pg.h"
#include "osd_http.h"
#include "json11/json11.hpp" #include "json11/json11.hpp"
#define OSD_OP_IN 0 #define OSD_OP_IN 0
@ -49,10 +50,6 @@
#define MAX_RECOVERY_QUEUE 2048 #define MAX_RECOVERY_QUEUE 2048
#define DEFAULT_RECOVERY_QUEUE 4 #define DEFAULT_RECOVERY_QUEUE 4
#define MAX_ETCD_ATTEMPTS 5
#define ETCD_START_INTERVAL 5000
#define ETCD_RETRY_INTERVAL 1000
//#define OSD_STUB //#define OSD_STUB
extern const char* osd_op_names[]; extern const char* osd_op_names[];
@ -189,16 +186,30 @@ struct osd_wanted_peer_t
int address_index; int address_index;
}; };
struct http_response_t; struct pg_config_t
{
bool exists;
osd_num_t primary;
std::vector<osd_num_t> target_set;
std::vector<std::vector<osd_num_t>> target_history;
bool pause;
osd_num_t cur_primary;
int cur_state;
};
struct websocket_t; struct json_kv_t
{
std::string key;
json11::Json value;
};
class osd_t class osd_t
{ {
// config // config
blockstore_config_t config; blockstore_config_t config;
std::string etcd_address, etcd_host, etcd_prefix, etcd_api_path; // FIXME Allow multiple etcd addresses and select random address
std::string etcd_address, etcd_prefix, etcd_api_path;
int etcd_report_interval = 30; int etcd_report_interval = 30;
bool readonly = false; bool readonly = false;
@ -214,16 +225,26 @@ class osd_t
int autosync_interval = DEFAULT_AUTOSYNC_INTERVAL; // sync every 5 seconds int autosync_interval = DEFAULT_AUTOSYNC_INTERVAL; // sync every 5 seconds
int recovery_queue_depth = DEFAULT_RECOVERY_QUEUE; int recovery_queue_depth = DEFAULT_RECOVERY_QUEUE;
int peer_connect_interval = 5; int peer_connect_interval = 5;
int http_request_timeout = 5;
int peer_connect_timeout = 5; int peer_connect_timeout = 5;
int log_level = 0;
// peer OSDs // cluster state
std::string etcd_lease_id, etcd_watch_revision; std::string etcd_lease_id;
int etcd_watches_initialised = 0;
uint64_t etcd_watch_revision = 0;
websocket_t *etcd_watch_ws = NULL;
std::map<osd_num_t, json11::Json> peer_states; std::map<osd_num_t, json11::Json> peer_states;
std::map<osd_num_t, osd_wanted_peer_t> wanted_peers; std::map<osd_num_t, osd_wanted_peer_t> wanted_peers;
bool loading_peer_config = false; bool loading_peer_config = false;
int etcd_failed_attempts = 0; int etcd_failed_attempts = 0;
std::map<pg_num_t, pg_config_t> pg_config;
std::set<pg_num_t> pg_state_dirty;
bool pg_config_applied = false;
bool etcd_reporting_pg_state = false;
bool etcd_reporting_stats = false;
// peers and PGs
std::map<uint64_t, int> osd_peer_fds; std::map<uint64_t, int> osd_peer_fds;
std::map<pg_num_t, pg_t> pgs; std::map<pg_num_t, pg_t> pgs;
@ -267,26 +288,36 @@ class osd_t
uint64_t subop_stat_count[2][OSD_OP_MAX+1] = { 0 }; uint64_t subop_stat_count[2][OSD_OP_MAX+1] = { 0 };
// cluster connection // cluster connection
void http_request(std::string host, std::string request, bool streaming, std::function<void(const http_response_t *response)> callback); void http_request(const std::string & host, const std::string & request,
void http_request_json(std::string host, std::string request, std::function<void(std::string, json11::Json data)> callback); const http_options_t & options, std::function<void(const http_response_t *response)> callback);
websocket_t* open_websocket(std::string host, std::string path, std::function<void(const http_response_t *msg)> callback); void http_request_json(const std::string & host, const std::string & request, int timeout,
void etcd_call(std::string api, json11::Json payload, std::function<void(std::string, json11::Json)> callback); std::function<void(std::string, json11::Json data)> callback);
void etcd_txn(json11::Json txn, std::function<void(std::string, json11::Json)> callback); websocket_t* open_websocket(const std::string & host, const std::string & path, int timeout,
std::function<void(const http_response_t *msg)> callback);
void etcd_call(std::string api, json11::Json payload, int timeout, std::function<void(std::string, json11::Json)> callback);
void etcd_txn(json11::Json txn, int timeout, std::function<void(std::string, json11::Json)> callback);
json_kv_t parse_etcd_kv(const json11::Json & kv_json);
void parse_config(blockstore_config_t & config); void parse_config(blockstore_config_t & config);
void init_cluster(); void init_cluster();
void start_etcd_watcher();
void load_global_config(); void load_global_config();
void bind_socket(); void bind_socket();
void acquire_lease(); void acquire_lease();
void create_state(); json11::Json get_osd_state();
void create_osd_state();
void renew_lease(); void renew_lease();
void print_stats(); void print_stats();
void reset_stats(); void reset_stats();
json11::Json get_status();
json11::Json get_statistics(); json11::Json get_statistics();
void report_statistics(); void report_statistics();
void report_pg_state(pg_t & pg);
void report_pg_states();
void load_pgs(); void load_pgs();
void parse_pgs(const json11::Json & pg_config, const std::map<pg_num_t, json11::Json> & pg_history); void parse_pg_state(const std::string & key, const json11::Json & value);
void apply_pg_count();
void apply_pg_config();
void load_and_connect_peers(); void load_and_connect_peers();
void parse_etcd_osd_state(const std::string & key, const json11::Json & value);
// event loop, socket read/write // event loop, socket read/write
void loop(); void loop();
@ -313,6 +344,7 @@ class osd_t
void repeer_pgs(osd_num_t osd_num); void repeer_pgs(osd_num_t osd_num);
void start_pg_peering(pg_num_t pg_num); void start_pg_peering(pg_num_t pg_num);
void submit_list_subop(osd_num_t role_osd, pg_peering_state_t *ps); void submit_list_subop(osd_num_t role_osd, pg_peering_state_t *ps);
void discard_list_subop(osd_op_t *list_op);
bool stop_pg(pg_num_t pg_num); bool stop_pg(pg_num_t pg_num);
void finish_stop_pg(pg_t & pg); void finish_stop_pg(pg_t & pg);
@ -356,7 +388,7 @@ class osd_t
public: public:
osd_t(blockstore_config_t & config, blockstore_t *bs, ring_loop_t *ringloop); osd_t(blockstore_config_t & config, blockstore_t *bs, ring_loop_t *ringloop);
~osd_t(); ~osd_t();
void force_stop(); void force_stop(int exitcode);
bool shutdown(); bool shutdown();
}; };

File diff suppressed because it is too large Load Diff

View File

@ -129,12 +129,16 @@ void osd_t::handle_flush_op(pg_num_t pg_num, pg_flush_batch_t *fb, osd_num_t osd
if (!pg.flush_actions.size()) if (!pg.flush_actions.size())
{ {
pg.state = pg.state & ~PG_HAS_UNCLEAN; pg.state = pg.state & ~PG_HAS_UNCLEAN;
pg.print_state(); report_pg_state(pg);
} }
for (osd_op_t *op: continue_ops) for (osd_op_t *op: continue_ops)
{ {
continue_primary_write(op); continue_primary_write(op);
} }
if (pg.inflight == 0 && (pg.state & PG_STOPPING))
{
finish_stop_pg(pg);
}
} }
} }

View File

@ -18,6 +18,7 @@ static std::string trim(const std::string & in);
static std::string ws_format_frame(int type, uint64_t size); static std::string ws_format_frame(int type, uint64_t size);
static bool ws_parse_frame(std::string & buf, int & type, std::string & res); static bool ws_parse_frame(std::string & buf, int & type, std::string & res);
// FIXME: Use keepalive
struct http_co_t struct http_co_t
{ {
ring_loop_t *ringloop; ring_loop_t *ringloop;
@ -66,15 +67,19 @@ struct http_co_t
#define HTTP_CO_WEBSOCKET 5 #define HTTP_CO_WEBSOCKET 5
#define HTTP_CO_CHUNKED 6 #define HTTP_CO_CHUNKED 6
void osd_t::http_request(std::string host, std::string request, bool streaming, std::function<void(const http_response_t *response)> callback) #define DEFAULT_TIMEOUT 5000
// FIXME: Remove osd_t dependency from here
void osd_t::http_request(const std::string & host, const std::string & request,
const http_options_t & options, std::function<void(const http_response_t *response)> callback)
{ {
http_co_t *handler = new http_co_t(); http_co_t *handler = new http_co_t();
handler->ringloop = ringloop; handler->ringloop = ringloop;
handler->epoll_fd = epoll_fd; handler->epoll_fd = epoll_fd;
handler->epoll_handlers = &epoll_handlers; handler->epoll_handlers = &epoll_handlers;
handler->request_timeout = http_request_timeout; handler->request_timeout = options.timeout < 0 ? 0 : (options.timeout == 0 ? DEFAULT_TIMEOUT : options.timeout);
handler->want_streaming = options.want_streaming;
handler->tfd = tfd; handler->tfd = tfd;
handler->want_streaming = streaming;
handler->host = host; handler->host = host;
handler->request = request; handler->request = request;
handler->callback = callback; handler->callback = callback;
@ -82,10 +87,10 @@ void osd_t::http_request(std::string host, std::string request, bool streaming,
handler->start_connection(); handler->start_connection();
} }
void osd_t::http_request_json(std::string host, std::string request, void osd_t::http_request_json(const std::string & host, const std::string & request,
std::function<void(std::string, json11::Json r)> callback) int timeout, std::function<void(std::string, json11::Json r)> callback)
{ {
http_request(host, request, false, [this, callback](const http_response_t* res) http_request(host, request, { .timeout = timeout }, [this, callback](const http_response_t* res)
{ {
if (res->error_code != 0) if (res->error_code != 0)
{ {
@ -108,7 +113,8 @@ void osd_t::http_request_json(std::string host, std::string request,
}); });
} }
websocket_t* osd_t::open_websocket(std::string host, std::string path, std::function<void(const http_response_t *msg)> callback) websocket_t* osd_t::open_websocket(const std::string & host, const std::string & path,
int timeout, std::function<void(const http_response_t *msg)> callback)
{ {
std::string request = "GET "+path+" HTTP/1.1\r\n" std::string request = "GET "+path+" HTTP/1.1\r\n"
"Host: "+host+"\r\n" "Host: "+host+"\r\n"
@ -121,9 +127,9 @@ websocket_t* osd_t::open_websocket(std::string host, std::string path, std::func
handler->ringloop = ringloop; handler->ringloop = ringloop;
handler->epoll_fd = epoll_fd; handler->epoll_fd = epoll_fd;
handler->epoll_handlers = &epoll_handlers; handler->epoll_handlers = &epoll_handlers;
handler->request_timeout = http_request_timeout; handler->request_timeout = timeout < 0 ? -1 : (timeout == 0 ? DEFAULT_TIMEOUT : timeout);
handler->tfd = tfd;
handler->want_streaming = false; handler->want_streaming = false;
handler->tfd = tfd;
handler->host = host; handler->host = host;
handler->request = request; handler->request = request;
handler->callback = callback; handler->callback = callback;
@ -215,11 +221,11 @@ void http_co_t::start_connection()
fcntl(peer_fd, F_SETFL, fcntl(peer_fd, F_GETFL, 0) | O_NONBLOCK); fcntl(peer_fd, F_SETFL, fcntl(peer_fd, F_GETFL, 0) | O_NONBLOCK);
if (request_timeout > 0) if (request_timeout > 0)
{ {
timeout_id = tfd->set_timer(1000*request_timeout, false, [this](int timer_id) timeout_id = tfd->set_timer(request_timeout, false, [this](int timer_id)
{ {
if (response.length() == 0) if (response.length() == 0)
{ {
parsed.error_code = EIO; parsed.error_code = ETIME;
} }
delete this; delete this;
}); });
@ -440,8 +446,8 @@ void http_co_t::handle_read()
if (!len) if (!len)
{ {
// Zero length chunk indicates EOF // Zero length chunk indicates EOF
delete this; parsed.eof = true;
return; break;
} }
if (response.size() < pos+2+len+2) if (response.size() < pos+2+len+2)
{ {
@ -454,6 +460,11 @@ void http_co_t::handle_read()
{ {
response = response.substr(prev); response = response.substr(prev);
} }
if (parsed.eof)
{
delete this;
return;
}
if (want_streaming && parsed.body.size() > 0) if (want_streaming && parsed.body.size() > 0)
{ {
callback(&parsed); callback(&parsed);

View File

@ -10,6 +10,12 @@
#define WS_PING 9 #define WS_PING 9
#define WS_PONG 10 #define WS_PONG 10
struct http_options_t
{
int timeout;
bool want_streaming;
};
struct http_response_t struct http_response_t
{ {
bool eof = false; bool eof = false;

View File

@ -10,7 +10,7 @@ static void handle_sigint(int sig)
if (osd && !force_stopping) if (osd && !force_stopping)
{ {
force_stopping = true; force_stopping = true;
osd->force_stop(); osd->force_stop(0);
return; return;
} }
exit(0); exit(0);

View File

@ -168,12 +168,13 @@ void osd_t::handle_peers()
{ {
if (!p.second.peering_state->list_ops.size()) if (!p.second.peering_state->list_ops.size())
{ {
p.second.calc_object_states(); p.second.calc_object_states(log_level);
report_pg_state(p.second);
incomplete_objects += p.second.incomplete_objects.size(); incomplete_objects += p.second.incomplete_objects.size();
misplaced_objects += p.second.misplaced_objects.size(); misplaced_objects += p.second.misplaced_objects.size();
// FIXME: degraded objects may currently include misplaced, too! Report them separately? // FIXME: degraded objects may currently include misplaced, too! Report them separately?
degraded_objects += p.second.degraded_objects.size(); degraded_objects += p.second.degraded_objects.size();
if (p.second.state & PG_HAS_UNCLEAN) if (p.second.state & (PG_ACTIVE | PG_HAS_UNCLEAN) == (PG_ACTIVE | PG_HAS_UNCLEAN))
peering_state = peering_state | OSD_FLUSHING_PGS; peering_state = peering_state | OSD_FLUSHING_PGS;
else else
peering_state = peering_state | OSD_RECOVERING; peering_state = peering_state | OSD_RECOVERING;
@ -195,7 +196,7 @@ void osd_t::handle_peers()
bool still = false; bool still = false;
for (auto & p: pgs) for (auto & p: pgs)
{ {
if (p.second.state & PG_HAS_UNCLEAN) if (p.second.state & (PG_ACTIVE | PG_HAS_UNCLEAN) == (PG_ACTIVE | PG_HAS_UNCLEAN))
{ {
if (!p.second.flush_batch) if (!p.second.flush_batch)
{ {
@ -224,7 +225,7 @@ void osd_t::repeer_pgs(osd_num_t peer_osd)
for (auto & p: pgs) for (auto & p: pgs)
{ {
bool repeer = false; bool repeer = false;
if (p.second.state != PG_OFFLINE) if (p.second.state & (PG_PEERING | PG_ACTIVE | PG_INCOMPLETE))
{ {
for (osd_num_t pg_osd: p.second.all_peers) for (osd_num_t pg_osd: p.second.all_peers)
{ {
@ -239,7 +240,6 @@ void osd_t::repeer_pgs(osd_num_t peer_osd)
// Repeer this pg // Repeer this pg
printf("[PG %u] Repeer because of OSD %lu\n", p.second.pg_num, peer_osd); printf("[PG %u] Repeer because of OSD %lu\n", p.second.pg_num, peer_osd);
start_pg_peering(p.second.pg_num); start_pg_peering(p.second.pg_num);
peering_state |= OSD_PEERING_PGS;
} }
} }
} }
@ -250,7 +250,8 @@ void osd_t::start_pg_peering(pg_num_t pg_num)
{ {
auto & pg = pgs[pg_num]; auto & pg = pgs[pg_num];
pg.state = PG_PEERING; pg.state = PG_PEERING;
pg.print_state(); this->peering_state |= OSD_PEERING_PGS;
report_pg_state(pg);
// Reset PG state // Reset PG state
pg.state_dict.clear(); pg.state_dict.clear();
incomplete_objects -= pg.incomplete_objects.size(); incomplete_objects -= pg.incomplete_objects.size();
@ -312,14 +313,14 @@ void osd_t::start_pg_peering(pg_num_t pg_num)
if (!found) if (!found)
{ {
pg.state = PG_INCOMPLETE; pg.state = PG_INCOMPLETE;
pg.print_state(); report_pg_state(pg);
} }
} }
} }
if (pg.pg_cursize < pg.pg_minsize) if (pg.pg_cursize < pg.pg_minsize)
{ {
pg.state = PG_INCOMPLETE; pg.state = PG_INCOMPLETE;
pg.print_state(); report_pg_state(pg);
} }
std::set<osd_num_t> cur_peers; std::set<osd_num_t> cur_peers;
for (auto peer_osd: pg.all_peers) for (auto peer_osd: pg.all_peers)
@ -342,25 +343,7 @@ void osd_t::start_pg_peering(pg_num_t pg_num)
if (pg.state == PG_INCOMPLETE || cur_peers.find(it->first) == cur_peers.end()) if (pg.state == PG_INCOMPLETE || cur_peers.find(it->first) == cur_peers.end())
{ {
// Discard the result after completion, which, chances are, will be unsuccessful // Discard the result after completion, which, chances are, will be unsuccessful
auto list_op = it->second; discard_list_subop(it->second);
if (list_op->peer_fd == 0)
{
// Self
list_op->bs_op->callback = [list_op](blockstore_op_t *bs_op)
{
if (list_op->bs_op->buf)
free(list_op->bs_op->buf);
delete list_op;
};
}
else
{
// Peer
list_op->callback = [](osd_op_t *list_op)
{
delete list_op;
};
}
pg.peering_state->list_ops.erase(it); pg.peering_state->list_ops.erase(it);
it = pg.peering_state->list_ops.begin(); it = pg.peering_state->list_ops.begin();
} }
@ -491,6 +474,28 @@ void osd_t::submit_list_subop(osd_num_t role_osd, pg_peering_state_t *ps)
} }
} }
void osd_t::discard_list_subop(osd_op_t *list_op)
{
if (list_op->peer_fd == 0)
{
// Self
list_op->bs_op->callback = [list_op](blockstore_op_t *bs_op)
{
if (list_op->bs_op->buf)
free(list_op->bs_op->buf);
delete list_op;
};
}
else
{
// Peer
list_op->callback = [](osd_op_t *list_op)
{
delete list_op;
};
}
}
bool osd_t::stop_pg(pg_num_t pg_num) bool osd_t::stop_pg(pg_num_t pg_num)
{ {
auto pg_it = pgs.find(pg_num); auto pg_it = pgs.find(pg_num);
@ -499,19 +504,52 @@ bool osd_t::stop_pg(pg_num_t pg_num)
return false; return false;
} }
auto & pg = pg_it->second; auto & pg = pg_it->second;
if (pg.peering_state)
{
// Stop peering
for (auto it = pg.peering_state->list_ops.begin(); it != pg.peering_state->list_ops.end();)
{
discard_list_subop(it->second);
}
for (auto it = pg.peering_state->list_results.begin(); it != pg.peering_state->list_results.end();)
{
if (it->second.buf)
{
free(it->second.buf);
}
}
delete pg.peering_state;
pg.peering_state = NULL;
}
if (!(pg.state & PG_ACTIVE)) if (!(pg.state & PG_ACTIVE))
{ {
return false; return false;
} }
pg.state = pg.state & ~PG_ACTIVE | PG_STOPPING; pg.state = pg.state & ~PG_ACTIVE | PG_STOPPING;
if (pg.inflight == 0) if (pg.inflight == 0 && !pg.flush_batch)
{ {
finish_stop_pg(pg); finish_stop_pg(pg);
} }
else
{
report_pg_state(pg);
}
return true; return true;
} }
void osd_t::finish_stop_pg(pg_t & pg) void osd_t::finish_stop_pg(pg_t & pg)
{ {
pg.state = PG_OFFLINE; pg.state = PG_OFFLINE;
report_pg_state(pg);
}
void osd_t::report_pg_state(pg_t & pg)
{
pg.print_state();
this->pg_state_dirty.insert(pg.pg_num);
if (pg.state == PG_OFFLINE && !this->pg_config_applied)
{
apply_pg_config();
}
report_pg_states();
} }

View File

@ -43,6 +43,7 @@ struct pg_obj_state_check_t
uint64_t n_copies = 0, has_roles = 0, n_roles = 0, n_stable = 0, n_mismatched = 0; uint64_t n_copies = 0, has_roles = 0, n_roles = 0, n_stable = 0, n_mismatched = 0;
uint64_t n_unstable = 0, n_buggy = 0; uint64_t n_unstable = 0, n_buggy = 0;
pg_osd_set_t osd_set; pg_osd_set_t osd_set;
int log_level;
void walk(); void walk();
void start_object(); void start_object();
@ -198,14 +199,16 @@ void pg_obj_state_check_t::finish_object()
} }
if (n_roles < pg->pg_minsize) if (n_roles < pg->pg_minsize)
{ {
printf("Object is incomplete: inode=%lu stripe=%lu version=%lu/%lu\n", oid.inode, oid.stripe, target_ver, max_ver); if (log_level > 1)
for (int i = ver_start; i < ver_end; i++)
{ {
printf("Present on: osd %lu, role %ld%s\n", list[i].osd_num, (list[i].oid.stripe & STRIPE_MASK), list[i].is_stable ? " (stable)" : ""); printf("Object is incomplete: inode=%lu stripe=%lu version=%lu/%lu\n", oid.inode, oid.stripe, target_ver, max_ver);
for (int i = ver_start; i < ver_end; i++)
{
printf("Present on: osd %lu, role %ld%s\n", list[i].osd_num, (list[i].oid.stripe & STRIPE_MASK), list[i].is_stable ? " (stable)" : "");
}
} }
if (0) if (log_level > 2)
{ {
// For future debug level
for (int i = obj_start; i < obj_end; i++) for (int i = obj_start; i < obj_end; i++)
{ {
printf("v%lu present on: osd %lu, role %ld%s\n", list[i].version, list[i].osd_num, (list[i].oid.stripe & STRIPE_MASK), list[i].is_stable ? " (stable)" : ""); printf("v%lu present on: osd %lu, role %ld%s\n", list[i].version, list[i].osd_num, (list[i].oid.stripe & STRIPE_MASK), list[i].is_stable ? " (stable)" : "");
@ -216,10 +219,13 @@ void pg_obj_state_check_t::finish_object()
} }
else if (n_roles < pg->pg_cursize) else if (n_roles < pg->pg_cursize)
{ {
printf("Object is degraded: inode=%lu stripe=%lu version=%lu/%lu\n", oid.inode, oid.stripe, target_ver, max_ver); if (log_level > 1)
for (int i = ver_start; i < ver_end; i++)
{ {
printf("Present on: osd %lu, role %ld%s\n", list[i].osd_num, (list[i].oid.stripe & STRIPE_MASK), list[i].is_stable ? " (stable)" : ""); printf("Object is degraded: inode=%lu stripe=%lu version=%lu/%lu\n", oid.inode, oid.stripe, target_ver, max_ver);
for (int i = ver_start; i < ver_end; i++)
{
printf("Present on: osd %lu, role %ld%s\n", list[i].osd_num, (list[i].oid.stripe & STRIPE_MASK), list[i].is_stable ? " (stable)" : "");
}
} }
state = OBJ_DEGRADED; state = OBJ_DEGRADED;
pg->state = pg->state | PG_HAS_DEGRADED; pg->state = pg->state | PG_HAS_DEGRADED;
@ -321,10 +327,11 @@ void pg_obj_state_check_t::finish_object()
} }
// FIXME: Write at least some tests for this function // FIXME: Write at least some tests for this function
void pg_t::calc_object_states() void pg_t::calc_object_states(int log_level)
{ {
// Copy all object lists into one array // Copy all object lists into one array
pg_obj_state_check_t st; pg_obj_state_check_t st;
st.log_level = log_level;
st.pg = this; st.pg = this;
auto ps = peering_state; auto ps = peering_state;
for (auto it: ps->list_results) for (auto it: ps->list_results)
@ -352,12 +359,10 @@ void pg_t::calc_object_states()
std::sort(st.list.begin(), st.list.end()); std::sort(st.list.begin(), st.list.end());
// Walk over it and check object states // Walk over it and check object states
st.walk(); st.walk();
print_state();
} }
void pg_t::print_state() void pg_t::print_state()
{ {
// FIXME Immediately report state on each change
printf( printf(
"[PG %u] is %s%s%s%s%s%s%s%s%s (%lu objects)\n", pg_num, "[PG %u] is %s%s%s%s%s%s%s%s%s (%lu objects)\n", pg_num,
(state & PG_OFFLINE) ? "offline" : "", (state & PG_OFFLINE) ? "offline" : "",
@ -373,13 +378,15 @@ void pg_t::print_state()
); );
} }
const int pg_state_bit_count = 10; const int pg_state_bit_count = 12;
const int pg_state_bits[10] = { const int pg_state_bits[12] = {
PG_OFFLINE, PG_STARTING,
PG_PEERING, PG_PEERING,
PG_INCOMPLETE, PG_INCOMPLETE,
PG_ACTIVE, PG_ACTIVE,
PG_STOPPING,
PG_OFFLINE,
PG_DEGRADED, PG_DEGRADED,
PG_HAS_INCOMPLETE, PG_HAS_INCOMPLETE,
PG_HAS_DEGRADED, PG_HAS_DEGRADED,
@ -387,11 +394,13 @@ const int pg_state_bits[10] = {
PG_HAS_UNCLEAN, PG_HAS_UNCLEAN,
}; };
const char *pg_state_names[10] = { const char *pg_state_names[12] = {
"offline", "starting",
"peering", "peering",
"incomplete", "incomplete",
"active", "active",
"stopping",
"offline",
"degraded", "degraded",
"has_incomplete", "has_incomplete",
"has_degraded", "has_degraded",

View File

@ -9,18 +9,20 @@
#include "osd_ops.h" #include "osd_ops.h"
// Placement group states // Placement group states
// STARTING -> [acquire lock] -> PEERING -> INCOMPLETE|ACTIVE -> STOPPING -> OFFLINE -> [release lock]
// Exactly one of these: // Exactly one of these:
#define PG_OFFLINE (1<<0) #define PG_STARTING (1<<0)
#define PG_PEERING (1<<1) #define PG_PEERING (1<<1)
#define PG_INCOMPLETE (1<<2) #define PG_INCOMPLETE (1<<2)
#define PG_ACTIVE (1<<3) #define PG_ACTIVE (1<<3)
#define PG_STOPPING (1<<4) #define PG_STOPPING (1<<4)
#define PG_OFFLINE (1<<5)
// Plus any of these: // Plus any of these:
#define PG_DEGRADED (1<<5) #define PG_DEGRADED (1<<6)
#define PG_HAS_INCOMPLETE (1<<6) #define PG_HAS_INCOMPLETE (1<<7)
#define PG_HAS_DEGRADED (1<<7) #define PG_HAS_DEGRADED (1<<8)
#define PG_HAS_MISPLACED (1<<8) #define PG_HAS_MISPLACED (1<<9)
#define PG_HAS_UNCLEAN (1<<9) #define PG_HAS_UNCLEAN (1<<10)
// FIXME: Safe default that doesn't depend on pg_stripe_size or pg_block_size // FIXME: Safe default that doesn't depend on pg_stripe_size or pg_block_size
#define STRIPE_MASK ((uint64_t)4096 - 1) #define STRIPE_MASK ((uint64_t)4096 - 1)
@ -33,8 +35,8 @@
#define OBJ_NEEDS_ROLLBACK 0x20000 #define OBJ_NEEDS_ROLLBACK 0x20000
#define OBJ_BUGGY 0x80000 #define OBJ_BUGGY 0x80000
extern const int pg_state_bits[10]; extern const int pg_state_bits[];
extern const char *pg_state_names[10]; extern const char *pg_state_names[];
extern const int pg_state_bit_count; extern const int pg_state_bit_count;
struct pg_obj_loc_t struct pg_obj_loc_t
@ -96,7 +98,7 @@ struct pg_flush_batch_t
struct pg_t struct pg_t
{ {
int state = PG_OFFLINE; int state = 0;
uint64_t pg_cursize = 3, pg_size = 3, pg_minsize = 2; uint64_t pg_cursize = 3, pg_size = 3, pg_minsize = 2;
pg_num_t pg_num; pg_num_t pg_num;
uint64_t clean_count = 0, total_count = 0; uint64_t clean_count = 0, total_count = 0;
@ -122,7 +124,7 @@ struct pg_t
int inflight = 0; // including write_queue int inflight = 0; // including write_queue
std::multimap<object_id, osd_op_t*> write_queue; std::multimap<object_id, osd_op_t*> write_queue;
void calc_object_states(); void calc_object_states(int log_level);
void print_state(); void print_state();
}; };

View File

@ -1,6 +1,7 @@
#define _LARGEFILE64_SOURCE #define _LARGEFILE64_SOURCE
#include "osd_peering_pg.h" #include "osd_peering_pg.h"
#define STRIPE_SHIFT 12
/** /**
* TODO tests for object & pg state calculation. * TODO tests for object & pg state calculation.
@ -43,7 +44,7 @@ int main(int argc, char *argv[])
} }
pg.peering_state->list_results[osd_num] = r; pg.peering_state->list_results[osd_num] = r;
} }
pg.calc_object_states(); pg.calc_object_states(0);
printf("deviation variants=%ld clean=%lu\n", pg.state_dict.size(), pg.clean_count); printf("deviation variants=%ld clean=%lu\n", pg.state_dict.size(), pg.clean_count);
for (auto it: pg.state_dict) for (auto it: pg.state_dict)
{ {

View File

@ -51,7 +51,7 @@ void osd_t::finish_op(osd_op_t *cur_op, int retval)
auto & pg = pgs[cur_op->op_data->pg_num]; auto & pg = pgs[cur_op->op_data->pg_num];
int n = --pg.inflight; int n = --pg.inflight;
assert(n >= 0); assert(n >= 0);
if (n == 0 && (pg.state & PG_STOPPING)) if ((pg.state & PG_STOPPING) && n == 0 && !pg.flush_batch)
{ {
finish_stop_pg(pg); finish_stop_pg(pg);
} }
@ -513,7 +513,7 @@ resume_8:
if (!pg.incomplete_objects.size()) if (!pg.incomplete_objects.size())
{ {
pg.state = pg.state & ~PG_HAS_INCOMPLETE; pg.state = pg.state & ~PG_HAS_INCOMPLETE;
pg.print_state(); report_pg_state(pg);
} }
} }
else if (op_data->object_state->state & OBJ_DEGRADED) else if (op_data->object_state->state & OBJ_DEGRADED)
@ -523,7 +523,7 @@ resume_8:
if (!pg.degraded_objects.size()) if (!pg.degraded_objects.size())
{ {
pg.state = pg.state & ~PG_HAS_DEGRADED; pg.state = pg.state & ~PG_HAS_DEGRADED;
pg.print_state(); report_pg_state(pg);
} }
} }
else if (op_data->object_state->state & OBJ_MISPLACED) else if (op_data->object_state->state & OBJ_MISPLACED)
@ -533,7 +533,7 @@ resume_8:
if (!pg.misplaced_objects.size()) if (!pg.misplaced_objects.size())
{ {
pg.state = pg.state & ~PG_HAS_MISPLACED; pg.state = pg.state & ~PG_HAS_MISPLACED;
pg.print_state(); report_pg_state(pg);
} }
} }
else else