vitastor/osd_cluster.cpp

367 lines
9.8 KiB
C++

#include <netinet/tcp.h>
#include <sys/epoll.h>
#include <net/if.h>
#include <ifaddrs.h>
#include "osd.h"
static int extract_port(std::string & host)
{
int port = 0;
int pos = 0;
if ((pos = host.find(':')) >= 0)
{
port = strtoull(host.c_str() + pos + 1, NULL, 10);
if (port >= 0x10000)
{
port = 0;
}
host = host.substr(0, pos);
}
return port;
}
std::vector<std::string> getifaddr_list()
{
std::vector<std::string> addresses;
ifaddrs *list, *ifa;
if (getifaddrs(&list) == -1)
{
throw std::runtime_error(std::string("getifaddrs: ") + strerror(errno));
}
for (ifa = list; ifa != NULL; ifa = ifa->ifa_next)
{
if (!ifa->ifa_addr)
{
continue;
}
int family = ifa->ifa_addr->sa_family;
if ((family == AF_INET || family == AF_INET6) &&
(ifa->ifa_flags & (IFF_UP | IFF_RUNNING | IFF_LOOPBACK)) == (IFF_UP | IFF_RUNNING))
{
void *addr_ptr;
if (family == AF_INET)
addr_ptr = &((sockaddr_in *)ifa->ifa_addr)->sin_addr;
else
addr_ptr = &((sockaddr_in6 *)ifa->ifa_addr)->sin6_addr;
char addr[INET6_ADDRSTRLEN];
if (!inet_ntop(family, addr_ptr, addr, INET6_ADDRSTRLEN))
{
throw std::runtime_error(std::string("inet_ntop: ") + strerror(errno));
}
addresses.push_back(std::string(addr));
}
}
freeifaddrs(list);
return addresses;
}
json11::Json osd_t::get_status()
{
json11::Json::object st;
st["state"] = "up";
if (bind_address != "0.0.0.0")
st["addresses"] = { bind_address };
else
{
if (bind_addresses.size() == 0)
bind_addresses = getifaddr_list();
st["addresses"] = bind_addresses;
}
st["port"] = bind_port;
st["blockstore_enabled"] = bs ? true : false;
if (bs)
{
st["size"] = bs->get_block_count() * bs->get_block_size();
st["free"] = bs->get_free_block_count() * bs->get_block_size();
}
json11::Json::object pg_status;
for (auto & p: pgs)
{
auto & pg = p.second;
json11::Json::object pg_st;
json11::Json::array pg_state;
for (int i = 0; i < pg_state_bit_count; i++)
if (pg.state & pg_state_bits[i])
pg_state.push_back(pg_state_names[i]);
pg_st["state"] = pg_state;
pg_st["object_count"] = pg.total_count;
pg_st["clean_count"] = pg.clean_count;
pg_st["misplaced_count"] = pg.misplaced_objects.size();
pg_st["degraded_count"] = pg.degraded_objects.size();
pg_st["incomplete_count"] = pg.incomplete_objects.size();
pg_st["write_osd_set"] = pg.cur_set;
pg_status[std::to_string(pg.pg_num)] = pg_st;
}
st["pgs"] = pg_status;
json11::Json::object op_stats, subop_stats;
for (int i = 0; i <= OSD_OP_MAX; i++)
{
op_stats[osd_op_names[i]] = json11::Json::object {
{ "count", op_stat_count[0][i] },
{ "sum", op_stat_sum[0][i] },
};
}
for (int i = 0; i <= OSD_OP_MAX; i++)
{
subop_stats[osd_op_names[i]] = json11::Json::object {
{ "count", subop_stat_count[0][i] },
{ "sum", subop_stat_sum[0][i] },
};
}
st["op_latency"] = op_stats;
st["subop_latency"] = subop_stats;
return st;
}
void osd_t::report_status()
{
if (consul_host == "")
{
consul_host = consul_address;
extract_port(consul_host);
}
std::string st = get_status().dump();
std::string req = "PUT /v1/kv/"+consul_prefix+"/osd/"+std::to_string(osd_num)+" HTTP/1.1\r\n"+
"Host: "+consul_host+"\r\n"+
"Content-Length: "+std::to_string(st.size())+"\r\n"+
"Connection: close\r\n"
"\r\n"+st;
http_request(consul_address, req, [this](int err, std::string res)
{
int pos = res.find("\r\n\r\n");
if (pos >= 0)
res = res.substr(pos+4);
if (err != 0 || res != "true")
printf("Error reporting state to Consul: code %d (%s), response text: %s\n", err, strerror(err), res.c_str());
});
}
struct http_co_t
{
osd_t *osd;
std::string host;
std::string request;
std::string response;
std::vector<char> rbuf;
int st = 0;
int peer_fd = -1;
int epoll_events = 0;
int code = 0;
int sent = 0, received = 0;
iovec iov;
msghdr msg = { 0 };
int cqe_res = 0;
std::function<void(int, std::string)> callback;
std::function<void(int, int)> epoll_handler;
~http_co_t();
void resume();
};
void osd_t::http_request(std::string host, std::string request, std::function<void(int, std::string)> callback)
{
http_co_t *handler = new http_co_t();
handler->osd = this;
handler->host = host;
handler->request = request;
handler->callback = callback;
handler->epoll_handler = [this, handler](int peer_fd, int epoll_events)
{
handler->epoll_events |= epoll_events;
handler->resume();
};
handler->resume();
}
http_co_t::~http_co_t()
{
callback(code, response);
if (peer_fd >= 0)
{
osd->epoll_handlers.erase(peer_fd);
epoll_ctl(osd->epoll_fd, EPOLL_CTL_DEL, peer_fd, NULL);
close(peer_fd);
peer_fd = -1;
}
}
void http_co_t::resume()
{
if (st == 0)
{
int port = extract_port(host);
struct sockaddr_in addr;
int r;
if ((r = inet_pton(AF_INET, host.c_str(), &addr.sin_addr)) != 1)
{
code = ENXIO;
delete this;
return;
}
addr.sin_family = AF_INET;
addr.sin_port = htons(port ? port : 80);
peer_fd = socket(AF_INET, SOCK_STREAM, 0);
if (peer_fd < 0)
{
code = errno;
delete this;
return;
}
fcntl(peer_fd, F_SETFL, fcntl(peer_fd, F_GETFL, 0) | O_NONBLOCK);
r = connect(peer_fd, (sockaddr*)&addr, sizeof(addr));
if (r < 0 && errno != EINPROGRESS)
{
code = errno;
delete this;
return;
}
osd->epoll_handlers[peer_fd] = epoll_handler;
// Add FD to epoll (EPOLLOUT for tracking connect() result)
epoll_event ev;
ev.data.fd = peer_fd;
ev.events = EPOLLOUT | EPOLLIN | EPOLLRDHUP | EPOLLET;
if (epoll_ctl(osd->epoll_fd, EPOLL_CTL_ADD, peer_fd, &ev) < 0)
{
code = errno;
delete this;
return;
}
epoll_events = 0;
st = 1;
return;
}
if (st == 1)
{
if (epoll_events & (EPOLLOUT | EPOLLERR))
{
int result = 0;
socklen_t result_len = sizeof(result);
if (getsockopt(peer_fd, SOL_SOCKET, SO_ERROR, &result, &result_len) < 0)
{
result = errno;
}
if (result != 0)
{
code = result;
delete this;
return;
}
int one = 1;
setsockopt(peer_fd, SOL_TCP, TCP_NODELAY, &one, sizeof(one));
// Disable EPOLLOUT on this fd
epoll_event ev;
ev.data.fd = peer_fd;
ev.events = EPOLLIN | EPOLLRDHUP | EPOLLET;
if (epoll_ctl(osd->epoll_fd, EPOLL_CTL_MOD, peer_fd, &ev) < 0)
{
code = errno;
delete this;
return;
}
st = 2;
epoll_events = 0;
resume();
return;
}
else if (epoll_events & EPOLLRDHUP)
{
delete this;
return;
}
else
{
return;
}
}
// Write data
if (st == 2)
{
io_uring_sqe *sqe = osd->ringloop->get_sqe();
if (!sqe)
return;
ring_data_t* data = ((ring_data_t*)sqe->user_data);
iov = (iovec){ .iov_base = (void*)(request.c_str()+sent), .iov_len = request.size()-sent };
msg.msg_iov = &iov;
msg.msg_iovlen = 1;
data->callback = [this](ring_data_t *data)
{
st = 4;
cqe_res = data->res;
resume();
};
my_uring_prep_sendmsg(sqe, peer_fd, &msg, 0);
st = 3;
return;
}
if (st == 3)
{
return;
}
if (st == 4)
{
if (cqe_res < 0 && cqe_res != -EAGAIN)
{
delete this;
return;
}
sent += cqe_res;
if (sent < request.size())
st = 2;
else
st = 5;
resume();
return;
}
// Read response
if (st == 5)
{
if (epoll_events & (EPOLLRDHUP|EPOLLERR))
{
delete this;
return;
}
else if (epoll_events & EPOLLIN)
{
if (rbuf.size() != 9000)
rbuf.resize(9000);
io_uring_sqe *sqe = osd->ringloop->get_sqe();
if (!sqe)
return;
ring_data_t* data = ((ring_data_t*)sqe->user_data);
iov = { .iov_base = rbuf.data(), .iov_len = 9000 };
msg.msg_iov = &iov;
msg.msg_iovlen = 1;
data->callback = [this](ring_data_t *data)
{
st = 7;
cqe_res = data->res;
resume();
};
my_uring_prep_recvmsg(sqe, peer_fd, &msg, 0);
st = 6;
epoll_events = 0;
}
}
if (st == 6)
{
return;
}
if (st == 7)
{
if (cqe_res < 0 && cqe_res != -EAGAIN)
{
delete this;
return;
}
response += std::string(rbuf.data(), cqe_res);
received += cqe_res;
st = 5;
resume();
return;
}
}