Simplified distributed block storage with strong consistency, like in Ceph
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 
 
 

207 lines
6.6 KiB

#include "osd.h"
// Queue one asynchronous recvmsg for client `cl` on the ring loop.
//
// Returns false, leaving the client untouched, when the ring loop cannot
// hand out a submission entry. Otherwise prepares the receive and returns
// true; the callback stored on the entry's ring data forwards the completion
// to handle_read() together with the client's descriptor.
bool osd_t::try_receive(osd_client_t & cl)
{
    int fd = cl.peer_fd;
    io_uring_sqe *entry = ringloop->get_sqe();
    if (!entry)
        return false;
    ring_data_t *completion = (ring_data_t*)entry->user_data;
    if (!cl.read_buf)
    {
        // Nothing is being read at the moment, so the next bytes form the
        // header of either a new command or a reply to a command sent earlier.
        if (!cl.read_op)
        {
            cl.read_op = new osd_op_t;
            cl.read_op->peer_fd = fd;
        }
        cl.read_op->op_type = OSD_OP_IN;
        cl.read_state = CL_READ_OP;
        cl.read_remaining = OSD_PACKET_SIZE;
        cl.read_buf = &cl.read_op->req.buf;
    }
    // Describe the unread remainder of the current buffer as a single iovec.
    // A non-zero iov_len is also what read_requests() checks before queueing
    // another receive for this client.
    cl.read_iov.iov_base = cl.read_buf;
    cl.read_iov.iov_len = cl.read_remaining;
    cl.read_msg.msg_iovlen = 1;
    cl.read_msg.msg_iov = &cl.read_iov;
    completion->callback = [this, fd](ring_data_t *done) { handle_read(done, fd); };
    my_uring_prep_recvmsg(entry, fd, &cl.read_msg, 0);
    return true;
}
// Go over every known client and queue a receive for each one that is
// connected and has no receive outstanding. read_iov.iov_len doubles as the
// "receive queued" marker: try_receive() fills it in and handle_read() resets
// it to zero. The result of try_receive() is not inspected here.
void osd_t::read_requests()
{
    for (auto & entry: clients)
    {
        auto & client = entry.second;
        if (client.peer_state != PEER_CONNECTED || client.read_iov.iov_len != 0)
        {
            // Not connected, or a receive is already in flight
            continue;
        }
        try_receive(client);
    }
}
// Completion handler for a receive queued by try_receive().
//
// data    - ring data of the finished request; data->res is the recvmsg result
// peer_fd - descriptor of the client the receive was issued for
//
// Advances the client's read position by the number of bytes received and,
// once the current buffer is complete, dispatches on cl.read_state:
//   CL_READ_OP         - a full header arrived; hand it to handle_reply_hdr()
//                        or handle_op_hdr() depending on its magic
//   CL_READ_DATA       - the payload of an incoming operation is complete;
//                        execute the operation
//   CL_READ_REPLY_DATA - the payload of a reply is complete; account subop
//                        latency and invoke the callback of the original request
// Completions for clients that are no longer in `clients` are ignored.
void osd_t::handle_read(ring_data_t *data, int peer_fd)
{
    auto cl_it = clients.find(peer_fd);
    if (cl_it == clients.end())
    {
        return;
    }
    auto & cl = cl_it->second;
    // The receive is finished, so read_requests() may queue the next one
    cl.read_iov.iov_len = 0;
    if (data->res == -EAGAIN)
    {
        return;
    }
    if (data->res < 0)
    {
        // this is a client socket, so don't panic. just disconnect it
        printf("Client %d socket read error: %d (%s). Disconnecting client\n", peer_fd, -data->res, strerror(-data->res));
        stop_client(peer_fd);
        return;
    }
    if (data->res == 0)
    {
        // NOTE(review): a zero-length read normally means the peer closed the
        // connection. It is ignored here, as before; confirm that disconnects
        // are detected elsewhere, otherwise this client will be polled forever.
        return;
    }
    cl.read_remaining -= data->res;
    cl.read_buf += data->res;
    if (cl.read_remaining > 0)
    {
        // Partial read; the rest is picked up by the next try_receive()
        return;
    }
    cl.read_buf = NULL;
    if (cl.read_state == CL_READ_OP)
    {
        if (cl.read_op->req.hdr.magic == SECONDARY_OSD_REPLY_MAGIC)
        {
            handle_reply_hdr(&cl);
        }
        else
        {
            handle_op_hdr(&cl);
        }
    }
    else if (cl.read_state == CL_READ_DATA)
    {
        // Operation is ready. Detach it from the client BEFORE executing it
        // (same order as in handle_op_hdr()), so that the client state is
        // never touched after exec_op() in case it drops the client.
        osd_op_t *ready_op = cl.read_op;
        cl.read_op = NULL;
        cl.read_state = 0;
        exec_op(ready_op);
    }
    else if (cl.read_state == CL_READ_REPLY_DATA)
    {
        // Reply is ready
        auto req_it = cl.sent_ops.find(cl.read_reply_id);
        if (req_it == cl.sent_ops.end())
        {
            // The request vanished while its reply payload was being read.
            // Dereferencing end() would be undefined behaviour, so treat it
            // like any other out-of-sync condition and drop the connection.
            printf("Client %d reply data for unknown command: id %lu\n", peer_fd, (unsigned long)cl.read_reply_id);
            stop_client(peer_fd);
            return;
        }
        osd_op_t *request = req_it->second;
        cl.sent_ops.erase(req_it);
        cl.read_reply_id = 0;
        cl.read_state = 0;
        // Measure subop latency (microseconds since request->tv_begin)
        timeval tv_end;
        gettimeofday(&tv_end, NULL);
        subop_stat_count[request->req.hdr.opcode]++;
        subop_stat_sum[request->req.hdr.opcode] += (
            (tv_end.tv_sec - request->tv_begin.tv_sec)*1000000 +
            tv_end.tv_usec - request->tv_begin.tv_usec
        );
        request->callback(request);
    }
}
// Process the header of an incoming operation stored in cl->read_op.
//
// For the opcodes listed below a 512-byte aligned buffer of the length given
// in the header is allocated (when that length is non-zero). Write-like
// opcodes (secondary write / stabilize / rollback, write) additionally expect
// that many payload bytes from the socket; read opcodes expect none.
// For any other opcode cl->read_remaining is left as it was on entry.
//
// If payload is expected the client switches to CL_READ_DATA, otherwise the
// operation is detached from the client and executed right away.
//
// NOTE(review): the lengths come straight from the received header and the
// memalign() results are not checked - presumably validated elsewhere; confirm.
void osd_t::handle_op_hdr(osd_client_t *cl)
{
    osd_op_t *op = cl->read_op;
    switch (op->req.hdr.opcode)
    {
    case OSD_OP_SECONDARY_READ:
        if (op->req.sec_rw.len > 0)
            op->buf = memalign(512, op->req.sec_rw.len);
        cl->read_remaining = 0;
        break;
    case OSD_OP_SECONDARY_WRITE:
        if (op->req.sec_rw.len > 0)
            op->buf = memalign(512, op->req.sec_rw.len);
        cl->read_remaining = op->req.sec_rw.len;
        break;
    case OSD_OP_SECONDARY_STABILIZE:
    case OSD_OP_SECONDARY_ROLLBACK:
        if (op->req.sec_stab.len > 0)
            op->buf = memalign(512, op->req.sec_stab.len);
        cl->read_remaining = op->req.sec_stab.len;
        break;
    case OSD_OP_READ:
        if (op->req.rw.len > 0)
            op->buf = memalign(512, op->req.rw.len);
        cl->read_remaining = 0;
        break;
    case OSD_OP_WRITE:
        if (op->req.rw.len > 0)
            op->buf = memalign(512, op->req.rw.len);
        cl->read_remaining = op->req.rw.len;
        break;
    default:
        // No buffer and no change to read_remaining for other opcodes
        break;
    }
    if (cl->read_remaining <= 0)
    {
        // Nothing more to receive: the operation is ready
        cl->read_op = NULL;
        cl->read_state = 0;
        exec_op(op);
        return;
    }
    // Payload follows the header: receive it into the operation's buffer
    cl->read_buf = op->buf;
    cl->read_state = CL_READ_DATA;
}
// Process a reply header that was received into cl->read_op.
//
// The reply is matched by id against the requests in cl->sent_ops and copied
// into the matching request's `reply`. Then:
//   - SECONDARY_READ with retval > 0: retval payload bytes are received into
//     the request's preallocated buffer (state CL_READ_REPLY_DATA);
//   - SECONDARY_LIST with retval > 0: a buffer for retval obj_ver_id entries
//     is allocated and the payload is received into it;
//   - anything else: the request is complete; it is removed from sent_ops,
//     its latency is accounted and its callback is invoked.
// The connection is dropped when the reply does not match any sent request,
// when a read reply claims more data than was requested, or when the listing
// buffer cannot be allocated.
void osd_t::handle_reply_hdr(osd_client_t *cl)
{
    osd_op_t *cur_op = cl->read_op;
    auto req_it = cl->sent_ops.find(cur_op->req.hdr.id);
    if (req_it == cl->sent_ops.end())
    {
        // Command out of sync. Drop connection
        printf("Client %d command out of sync: id %lu\n", cl->peer_fd, cur_op->req.hdr.id);
        stop_client(cl->peer_fd);
        return;
    }
    osd_op_t *op = req_it->second;
    memcpy(op->reply.buf, cur_op->req.buf, OSD_PACKET_SIZE);
    if (op->reply.hdr.opcode == OSD_OP_SECONDARY_READ &&
        op->reply.hdr.retval > 0)
    {
        // retval is supplied by the peer and becomes the number of bytes
        // written into the caller's buffer, so it must never exceed the
        // length that was requested. retval is known to be positive here,
        // which makes the unsigned comparison safe.
        if ((unsigned long long)op->reply.hdr.retval > (unsigned long long)op->req.sec_rw.len)
        {
            printf(
                "Client %d read reply is longer than requested: got %lld, expected at most %llu\n",
                cl->peer_fd, (long long)op->reply.hdr.retval, (unsigned long long)op->req.sec_rw.len
            );
            stop_client(cl->peer_fd);
            return;
        }
        // Read data. In this case we assume that the buffer is preallocated by the caller (!)
        assert(op->buf);
        cl->read_state = CL_READ_REPLY_DATA;
        cl->read_reply_id = op->req.hdr.id;
        cl->read_buf = op->buf;
        cl->read_remaining = op->reply.hdr.retval;
    }
    else if (op->reply.hdr.opcode == OSD_OP_SECONDARY_LIST &&
        op->reply.hdr.retval > 0)
    {
        // NOTE(review): the entry count comes from the peer and is not bounded here
        op->buf = memalign(512, sizeof(obj_ver_id) * op->reply.hdr.retval);
        if (!op->buf)
        {
            // With a NULL read_buf try_receive() would start parsing a new
            // header in the middle of this reply, so give up on the connection
            printf("Client %d: failed to allocate buffer for listing reply\n", cl->peer_fd);
            stop_client(cl->peer_fd);
            return;
        }
        cl->read_state = CL_READ_REPLY_DATA;
        cl->read_reply_id = op->req.hdr.id;
        cl->read_buf = op->buf;
        cl->read_remaining = sizeof(obj_ver_id) * op->reply.hdr.retval;
    }
    else
    {
        // No payload follows: the request is complete
        cl->read_state = 0;
        cl->sent_ops.erase(req_it);
        // Measure subop latency (microseconds since op->tv_begin)
        timeval tv_end;
        gettimeofday(&tv_end, NULL);
        subop_stat_count[op->req.hdr.opcode]++;
        subop_stat_sum[op->req.hdr.opcode] += (
            (tv_end.tv_sec - op->tv_begin.tv_sec)*1000000 +
            tv_end.tv_usec - op->tv_begin.tv_usec
        );
        op->callback(op);
    }
}