diff --git a/README.md b/README.md index 14c3cadc..66d979c0 100644 --- a/README.md +++ b/README.md @@ -357,9 +357,6 @@ and calculate disk offsets almost by hand. This will be fixed in near future. ## Known Problems -- OSDs may currently crash with "can't get SQE, will fall out of sync with EPOLLET" - if you try to load them with very long iodepths because io_uring queue (ring) is limited - and OSDs don't check if it fills up. - Object deletion requests may currently lead to 'incomplete' objects if your OSDs crash during deletion because proper handling of object cleanup in a cluster should be "three-phase" and it's currently not implemented. Inode removal tool currently can't handle unclean diff --git a/msgr_send.cpp b/msgr_send.cpp index bef6679d..fe59ea4a 100644 --- a/msgr_send.cpp +++ b/msgr_send.cpp @@ -122,9 +122,6 @@ bool osd_messenger_t::try_send(osd_client_t *cl) { return true; } - cl->write_msg.msg_iov = cl->send_list.data(); - cl->write_msg.msg_iovlen = cl->send_list.size(); - cl->refs++; if (ringloop && !use_sync_send_recv) { io_uring_sqe* sqe = ringloop->get_sqe(); @@ -132,12 +129,18 @@ bool osd_messenger_t::try_send(osd_client_t *cl) { return false; } + cl->write_msg.msg_iov = cl->send_list.data(); + cl->write_msg.msg_iovlen = cl->send_list.size(); + cl->refs++; ring_data_t* data = ((ring_data_t*)sqe->user_data); data->callback = [this, cl](ring_data_t *data) { handle_send(data->res, cl); }; my_uring_prep_sendmsg(sqe, peer_fd, &cl->write_msg, 0); } else { + cl->write_msg.msg_iov = cl->send_list.data(); + cl->write_msg.msg_iovlen = cl->send_list.size(); + cl->refs++; int result = sendmsg(peer_fd, &cl->write_msg, MSG_NOSIGNAL); if (result < 0) { diff --git a/ringloop.cpp b/ringloop.cpp index a7532f0e..d28b0a69 100644 --- a/ringloop.cpp +++ b/ringloop.cpp @@ -66,10 +66,18 @@ void ring_loop_t::loop() struct ring_data_t *d = (struct ring_data_t*)cqe->user_data; if (d->callback) { - d->res = cqe->res; - d->callback(d); + // First free ring_data item, then call the callback + // so it has at least 1 free slot for the next event + // which is required for EPOLLET to function properly + struct ring_data_t dl; + dl.iov = d->iov; + dl.res = cqe->res; + dl.callback.swap(d->callback); + free_ring_data[free_ring_data_ptr++] = d - ring_datas; + dl.callback(&dl); } - free_ring_data[free_ring_data_ptr++] = d - ring_datas; + else + free_ring_data[free_ring_data_ptr++] = d - ring_datas; io_uring_cqe_seen(&ring, cqe); } while (get_sqe_queue.size() > 0)