Browse Source

Fix "can't get SQE, will fall out of sync with EPOLLET" when overflowing the ring

OSDs shouldn't crash or hang with long iodepths anymore
Vitaliy Filippov 11 months ago
parent
commit
23ea409081
  1. 3
      README.md
  2. 9
      msgr_send.cpp
  3. 14
      ringloop.cpp

3
README.md

@ -357,9 +357,6 @@ and calculate disk offsets almost by hand. This will be fixed in near future.
## Known Problems
- OSDs may currently crash with "can't get SQE, will fall out of sync with EPOLLET"
if you try to load them with very long iodepths, because the io_uring queue (ring) has a limited size
and OSDs don't check whether it fills up.
- Object deletion requests may currently lead to 'incomplete' objects if your OSDs crash during
deletion, because proper handling of object cleanup in a cluster should be "three-phase"
and this is currently not implemented. The inode removal tool currently can't handle unclean

9
msgr_send.cpp

@ -122,9 +122,6 @@ bool osd_messenger_t::try_send(osd_client_t *cl)
{
return true;
}
cl->write_msg.msg_iov = cl->send_list.data();
cl->write_msg.msg_iovlen = cl->send_list.size();
cl->refs++;
if (ringloop && !use_sync_send_recv)
{
io_uring_sqe* sqe = ringloop->get_sqe();
@ -132,12 +129,18 @@ bool osd_messenger_t::try_send(osd_client_t *cl)
{
return false;
}
cl->write_msg.msg_iov = cl->send_list.data();
cl->write_msg.msg_iovlen = cl->send_list.size();
cl->refs++;
ring_data_t* data = ((ring_data_t*)sqe->user_data);
data->callback = [this, cl](ring_data_t *data) { handle_send(data->res, cl); };
my_uring_prep_sendmsg(sqe, peer_fd, &cl->write_msg, 0);
}
else
{
cl->write_msg.msg_iov = cl->send_list.data();
cl->write_msg.msg_iovlen = cl->send_list.size();
cl->refs++;
int result = sendmsg(peer_fd, &cl->write_msg, MSG_NOSIGNAL);
if (result < 0)
{

14
ringloop.cpp

@ -66,10 +66,18 @@ void ring_loop_t::loop()
struct ring_data_t *d = (struct ring_data_t*)cqe->user_data;
if (d->callback)
{
d->res = cqe->res;
d->callback(d);
// First free ring_data item, then call the callback
// so it has at least 1 free slot for the next event
// which is required for EPOLLET to function properly
struct ring_data_t dl;
dl.iov = d->iov;
dl.res = cqe->res;
dl.callback.swap(d->callback);
free_ring_data[free_ring_data_ptr++] = d - ring_datas;
dl.callback(&dl);
}
free_ring_data[free_ring_data_ptr++] = d - ring_datas;
else
free_ring_data[free_ring_data_ptr++] = d - ring_datas;
io_uring_cqe_seen(&ring, cqe);
}
while (get_sqe_queue.size() > 0)

Loading…
Cancel
Save