Browse Source

Fix "can't get SQE, will fall out of sync with EPOLLET" when overflowing the ring

OSDs shouldn't crash or hang with long iodepths anymore
tags/v0.5.1
Vitaliy Filippov 2 months ago
parent
commit
23ea409081
3 changed files with 17 additions and 9 deletions
  1. +0
    -3
      README.md
  2. +6
    -3
      msgr_send.cpp
  3. +11
    -3
      ringloop.cpp

+ 0
- 3
README.md View File

@@ -357,9 +357,6 @@ and calculate disk offsets almost by hand. This will be fixed in the near future.

## Known Problems

- OSDs may currently crash with "can't get SQE, will fall out of sync with EPOLLET"
if you try to load them with very long iodepths, because the io_uring queue (ring) has a limited
size and OSDs don't check whether it fills up.
- Object deletion requests may currently lead to 'incomplete' objects if your OSDs crash during
deletion because proper handling of object cleanup in a cluster should be "three-phase"
and it's currently not implemented. Inode removal tool currently can't handle unclean


+ 6
- 3
msgr_send.cpp View File

@@ -122,9 +122,6 @@ bool osd_messenger_t::try_send(osd_client_t *cl)
{
return true;
}
cl->write_msg.msg_iov = cl->send_list.data();
cl->write_msg.msg_iovlen = cl->send_list.size();
cl->refs++;
if (ringloop && !use_sync_send_recv)
{
io_uring_sqe* sqe = ringloop->get_sqe();
@@ -132,12 +129,18 @@ bool osd_messenger_t::try_send(osd_client_t *cl)
{
return false;
}
cl->write_msg.msg_iov = cl->send_list.data();
cl->write_msg.msg_iovlen = cl->send_list.size();
cl->refs++;
ring_data_t* data = ((ring_data_t*)sqe->user_data);
data->callback = [this, cl](ring_data_t *data) { handle_send(data->res, cl); };
my_uring_prep_sendmsg(sqe, peer_fd, &cl->write_msg, 0);
}
else
{
cl->write_msg.msg_iov = cl->send_list.data();
cl->write_msg.msg_iovlen = cl->send_list.size();
cl->refs++;
int result = sendmsg(peer_fd, &cl->write_msg, MSG_NOSIGNAL);
if (result < 0)
{


+ 11
- 3
ringloop.cpp View File

@@ -66,10 +66,18 @@ void ring_loop_t::loop()
struct ring_data_t *d = (struct ring_data_t*)cqe->user_data;
if (d->callback)
{
d->res = cqe->res;
d->callback(d);
// First free ring_data item, then call the callback
// so it has at least 1 free slot for the next event
// which is required for EPOLLET to function properly
struct ring_data_t dl;
dl.iov = d->iov;
dl.res = cqe->res;
dl.callback.swap(d->callback);
free_ring_data[free_ring_data_ptr++] = d - ring_datas;
dl.callback(&dl);
}
free_ring_data[free_ring_data_ptr++] = d - ring_datas;
else
free_ring_data[free_ring_data_ptr++] = d - ring_datas;
io_uring_cqe_seen(&ring, cqe);
}
while (get_sqe_queue.size() > 0)


Loading…
Cancel
Save