diff --git a/blockstore_flush.cpp b/blockstore_flush.cpp
index 1d53158e..fc858462 100644
--- a/blockstore_flush.cpp
+++ b/blockstore_flush.cpp
@@ -287,8 +287,9 @@ resume_1:
             data->iov = (struct iovec){ it->buf, (size_t)it->len };
             data->callback = simple_callback_w;
             my_uring_prep_writev(
-                sqe, bs->data_fd, &data->iov, 1, bs->data_offset + clean_loc + it->offset
+                sqe, bs->data_fd_index, &data->iov, 1, bs->data_offset + clean_loc + it->offset
             );
+            sqe->flags |= IOSQE_FIXED_FILE;
             wait_count++;
         }
         // Sync data before writing metadata
@@ -320,8 +321,9 @@ resume_1:
         data->iov = (struct iovec){ meta_old.buf, bs->meta_block_size };
         data->callback = simple_callback_w;
         my_uring_prep_writev(
-            sqe, bs->meta_fd, &data->iov, 1, bs->meta_offset + meta_old.sector
+            sqe, bs->meta_fd_index, &data->iov, 1, bs->meta_offset + meta_old.sector
         );
+        sqe->flags |= IOSQE_FIXED_FILE;
         wait_count++;
     }
     if (has_delete)
@@ -342,8 +344,9 @@ resume_1:
         data->iov = (struct iovec){ meta_new.buf, bs->meta_block_size };
         data->callback = simple_callback_w;
         my_uring_prep_writev(
-            sqe, bs->meta_fd, &data->iov, 1, bs->meta_offset + meta_new.sector
+            sqe, bs->meta_fd_index, &data->iov, 1, bs->meta_offset + meta_new.sector
         );
+        sqe->flags |= IOSQE_FIXED_FILE;
         wait_count++;
     resume_7:
         if (wait_count > 0)
@@ -405,7 +408,8 @@ resume_1:
         ((journal_entry_start*)flusher->journal_superblock)->crc32 = je_crc32((journal_entry*)flusher->journal_superblock);
         data->iov = (struct iovec){ flusher->journal_superblock, bs->journal_block_size };
         data->callback = simple_callback_w;
-        my_uring_prep_writev(sqe, bs->journal.fd, &data->iov, 1, bs->journal.offset);
+        my_uring_prep_writev(sqe, bs->journal_fd_index, &data->iov, 1, bs->journal.offset);
+        sqe->flags |= IOSQE_FIXED_FILE;
         wait_count++;
     resume_13:
         if (wait_count > 0)
@@ -485,8 +489,9 @@ bool journal_flusher_co::scan_dirty(int wait_base)
                     data->iov = (struct iovec){ v.back().buf, (size_t)submit_len };
                     data->callback = simple_callback_r;
                     my_uring_prep_readv(
-                        sqe, bs->journal.fd, &data->iov, 1, bs->journal.offset + submit_offset
+                        sqe, bs->journal_fd_index, &data->iov, 1, bs->journal.offset + submit_offset
                     );
+                    sqe->flags |= IOSQE_FIXED_FILE;
                     wait_count++;
                 }
             }
@@ -568,8 +573,9 @@ bool journal_flusher_co::modify_meta_read(uint64_t meta_loc, flusher_meta_write_
         data->callback = simple_callback_r;
         wr.submitted = true;
         my_uring_prep_readv(
-            sqe, bs->meta_fd, &data->iov, 1, bs->meta_offset + wr.sector
+            sqe, bs->meta_fd_index, &data->iov, 1, bs->meta_offset + wr.sector
         );
+        sqe->flags |= IOSQE_FIXED_FILE;
         wait_count++;
     }
     else
@@ -638,7 +644,8 @@ bool journal_flusher_co::fsync_batch(bool fsync_meta, int wait_base)
         await_sqe(0);
         data->iov = { 0 };
         data->callback = simple_callback_w;
-        my_uring_prep_fsync(sqe, fsync_meta ? bs->meta_fd : bs->data_fd, IORING_FSYNC_DATASYNC);
+        my_uring_prep_fsync(sqe, fsync_meta ? bs->meta_fd_index : bs->data_fd_index, IORING_FSYNC_DATASYNC);
+        sqe->flags |= IOSQE_FIXED_FILE;
         cur_sync->state = 1;
         wait_count++;
     resume_1:
diff --git a/blockstore_impl.h b/blockstore_impl.h
index 34ef453d..70cde03e 100644
--- a/blockstore_impl.h
+++ b/blockstore_impl.h
@@ -216,6 +216,7 @@ class blockstore_impl_t
     int data_fd;
     uint64_t meta_size, meta_area, meta_len;
     uint64_t data_size, data_len;
+    int meta_fd_index, data_fd_index, journal_fd_index;
 
     void *metadata_buffer = NULL;
 
diff --git a/blockstore_init.cpp b/blockstore_init.cpp
index 284d1567..61b410fa 100644
--- a/blockstore_init.cpp
+++ b/blockstore_init.cpp
@@ -55,7 +55,8 @@ int blockstore_init_meta::loop()
         bs->meta_len - metadata_read > bs->metadata_buf_size ? bs->metadata_buf_size : bs->meta_len - metadata_read,
     };
     data->callback = [this](ring_data_t *data) { handle_event(data); };
-    my_uring_prep_readv(sqe, bs->meta_fd, &data->iov, 1, bs->meta_offset + metadata_read);
+    my_uring_prep_readv(sqe, bs->meta_fd_index, &data->iov, 1, bs->meta_offset + metadata_read);
+    sqe->flags |= IOSQE_FIXED_FILE;
     bs->ringloop->submit();
     submitted = (prev == 1 ? 2 : 1);
     prev = submitted;
@@ -216,7 +217,8 @@ int blockstore_init_journal::loop()
         data = ((ring_data_t*)sqe->user_data);
         data->iov = { submitted_buf, bs->journal.block_size };
         data->callback = simple_callback;
-        my_uring_prep_readv(sqe, bs->journal.fd, &data->iov, 1, bs->journal.offset);
+        my_uring_prep_readv(sqe, bs->journal_fd_index, &data->iov, 1, bs->journal.offset);
+        sqe->flags |= IOSQE_FIXED_FILE;
         bs->ringloop->submit();
         wait_count = 1;
     resume_1:
@@ -254,7 +256,8 @@ resume_1:
             GET_SQE();
             data->iov = (struct iovec){ submitted_buf, 2*bs->journal.block_size };
             data->callback = simple_callback;
-            my_uring_prep_writev(sqe, bs->journal.fd, &data->iov, 1, bs->journal.offset);
+            my_uring_prep_writev(sqe, bs->journal_fd_index, &data->iov, 1, bs->journal.offset);
+            sqe->flags |= IOSQE_FIXED_FILE;
             wait_count++;
             bs->ringloop->submit();
         resume_6:
@@ -266,7 +269,8 @@ resume_1:
             if (!bs->disable_journal_fsync)
             {
                 GET_SQE();
-                my_uring_prep_fsync(sqe, bs->journal.fd, IORING_FSYNC_DATASYNC);
+                my_uring_prep_fsync(sqe, bs->journal_fd_index, IORING_FSYNC_DATASYNC);
+                sqe->flags |= IOSQE_FIXED_FILE;
                 data->iov = { 0 };
                 data->callback = simple_callback;
                 wait_count++;
@@ -325,7 +329,8 @@ resume_1:
                 end - journal_pos < JOURNAL_BUFFER_SIZE ? end - journal_pos : JOURNAL_BUFFER_SIZE,
             };
             data->callback = [this](ring_data_t *data1) { handle_event(data1); };
-            my_uring_prep_readv(sqe, bs->journal.fd, &data->iov, 1, bs->journal.offset + journal_pos);
+            my_uring_prep_readv(sqe, bs->journal_fd_index, &data->iov, 1, bs->journal.offset + journal_pos);
+            sqe->flags |= IOSQE_FIXED_FILE;
             bs->ringloop->submit();
         }
         while (done.size() > 0)
@@ -340,7 +345,8 @@ resume_1:
             GET_SQE();
             data->iov = { init_write_buf, bs->journal.block_size };
             data->callback = simple_callback;
-            my_uring_prep_writev(sqe, bs->journal.fd, &data->iov, 1, bs->journal.offset + init_write_sector);
+            my_uring_prep_writev(sqe, bs->journal_fd_index, &data->iov, 1, bs->journal.offset + init_write_sector);
+            sqe->flags |= IOSQE_FIXED_FILE;
             wait_count++;
             bs->ringloop->submit();
         resume_7:
@@ -354,7 +360,8 @@ resume_1:
                 GET_SQE();
                 data->iov = { 0 };
                 data->callback = simple_callback;
-                my_uring_prep_fsync(sqe, bs->journal.fd, IORING_FSYNC_DATASYNC);
+                my_uring_prep_fsync(sqe, bs->journal_fd_index, IORING_FSYNC_DATASYNC);
+                sqe->flags |= IOSQE_FIXED_FILE;
                 wait_count++;
                 bs->ringloop->submit();
             }
diff --git a/blockstore_journal.cpp b/blockstore_journal.cpp
index 0d095074..b5d5a652 100644
--- a/blockstore_journal.cpp
+++ b/blockstore_journal.cpp
@@ -126,8 +126,9 @@ void prepare_journal_sector_write(journal_t & journal, int cur_sector, io_uring_
     };
     data->callback = cb;
     my_uring_prep_writev(
-        sqe, journal.fd, &data->iov, 1, journal.offset + journal.sector_info[cur_sector].offset
+        sqe, journal.fd_index, &data->iov, 1, journal.offset + journal.sector_info[cur_sector].offset
     );
+    sqe->flags |= IOSQE_FIXED_FILE;
 }
 
 journal_t::~journal_t()
diff --git a/blockstore_journal.h b/blockstore_journal.h
index 5d14985a..27ecbc5b 100644
--- a/blockstore_journal.h
+++ b/blockstore_journal.h
@@ -130,7 +130,7 @@ struct journal_sector_info_t
 
 struct journal_t
 {
-    int fd;
+    int fd, fd_index;
     uint64_t device_size;
     bool inmemory = false;
     void *buffer = NULL;
diff --git a/blockstore_open.cpp b/blockstore_open.cpp
index 85e3320e..dc172761 100644
--- a/blockstore_open.cpp
+++ b/blockstore_open.cpp
@@ -140,6 +140,10 @@ void blockstore_impl_t::parse_config(blockstore_config_t & config)
 
 void blockstore_impl_t::calc_lengths()
 {
+    // register fds
+    data_fd_index = ringloop->register_fd(data_fd);
+    meta_fd_index = meta_fd == data_fd ? data_fd_index : ringloop->register_fd(meta_fd);
+    journal.fd_index = journal_fd_index = journal.fd == meta_fd ? meta_fd_index : ringloop->register_fd(journal.fd);
     // data
     data_len = data_size - data_offset;
     if (data_fd == meta_fd && data_offset < meta_offset)
diff --git a/blockstore_read.cpp b/blockstore_read.cpp
index 31b06d95..6728b361 100644
--- a/blockstore_read.cpp
+++ b/blockstore_read.cpp
@@ -31,10 +31,11 @@ int blockstore_impl_t::fulfill_read_push(blockstore_op_t *op, void *buf, uint64_
     PRIV(op)->pending_ops++;
     my_uring_prep_readv(
         sqe,
-        IS_JOURNAL(item_state) ? journal.fd : data_fd,
+        IS_JOURNAL(item_state) ? journal_fd_index : data_fd_index,
         &data->iov, 1,
         (IS_JOURNAL(item_state) ? journal.offset : data_offset) + offset
     );
+    sqe->flags |= IOSQE_FIXED_FILE;
     data->callback = [this, op](ring_data_t *data) { handle_read_event(data, op); };
     return 1;
 }
diff --git a/blockstore_sync.cpp b/blockstore_sync.cpp
index 2b30b899..da02cffe 100644
--- a/blockstore_sync.cpp
+++ b/blockstore_sync.cpp
@@ -78,7 +78,8 @@ int blockstore_impl_t::continue_sync(blockstore_op_t *op)
         if (!disable_data_fsync)
         {
             BS_SUBMIT_GET_SQE(sqe, data);
-            my_uring_prep_fsync(sqe, data_fd, IORING_FSYNC_DATASYNC);
+            my_uring_prep_fsync(sqe, data_fd_index, IORING_FSYNC_DATASYNC);
+            sqe->flags |= IOSQE_FIXED_FILE;
             data->iov = { 0 };
             data->callback = cb;
             PRIV(op)->min_used_journal_sector = PRIV(op)->max_used_journal_sector = 0;
@@ -161,7 +162,8 @@ int blockstore_impl_t::continue_sync(blockstore_op_t *op)
         if (!disable_journal_fsync)
         {
             BS_SUBMIT_GET_SQE(sqe, data);
-            my_uring_prep_fsync(sqe, journal.fd, IORING_FSYNC_DATASYNC);
+            my_uring_prep_fsync(sqe, journal_fd_index, IORING_FSYNC_DATASYNC);
+            sqe->flags |= IOSQE_FIXED_FILE;
             data->iov = { 0 };
             data->callback = cb;
             PRIV(op)->pending_ops = 1;
diff --git a/blockstore_write.cpp b/blockstore_write.cpp
index 0270f0b7..a94a0167 100644
--- a/blockstore_write.cpp
+++ b/blockstore_write.cpp
@@ -122,8 +122,9 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
         data->iov.iov_len = op->len + stripe_offset + stripe_end; // to check it in the callback
         data->callback = [this, op](ring_data_t *data) { handle_write_event(data, op); };
         my_uring_prep_writev(
-            sqe, data_fd, PRIV(op)->iov_zerofill, vcnt, data_offset + (loc << block_order) + op->offset - stripe_offset
+            sqe, data_fd_index, PRIV(op)->iov_zerofill, vcnt, data_offset + (loc << block_order) + op->offset - stripe_offset
         );
+        sqe->flags |= IOSQE_FIXED_FILE;
         PRIV(op)->pending_ops = 1;
         PRIV(op)->min_used_journal_sector = PRIV(op)->max_used_journal_sector = 0;
         // Remember big write as unsynced
@@ -198,8 +199,9 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
             data2->iov = (struct iovec){ op->buf, op->len };
             data2->callback = cb;
             my_uring_prep_writev(
-                sqe2, journal.fd, &data2->iov, 1, journal.offset + journal.next_free
+                sqe2, journal_fd_index, &data2->iov, 1, journal.offset + journal.next_free
             );
+            sqe2->flags |= IOSQE_FIXED_FILE;
             PRIV(op)->pending_ops++;
         }
         else
diff --git a/osd.cpp b/osd.cpp
index e4b820ef..0e77c9bd 100644
--- a/osd.cpp
+++ b/osd.cpp
@@ -110,6 +110,7 @@ osd_t::osd_t(blockstore_config_t & config, blockstore_t *bs, ring_loop_t *ringlo
         close(listen_fd);
         throw std::runtime_error(std::string("epoll_create: ") + strerror(errno));
     }
+    epoll_fd_index = ringloop->register_fd(epoll_fd);
 
     epoll_event ev;
     ev.data.fd = listen_fd;
@@ -186,16 +187,19 @@ void osd_t::handle_epoll_events()
         throw std::runtime_error("can't get SQE, will fall out of sync with EPOLLET");
     }
     ring_data_t *data = ((ring_data_t*)sqe->user_data);
-    my_uring_prep_poll_add(sqe, epoll_fd, POLLIN);
+    data->allow_cancel = true;
+    my_uring_prep_poll_add(sqe, epoll_fd_index, POLLIN);
+    sqe->flags |= IOSQE_FIXED_FILE;
     data->callback = [this](ring_data_t *data)
     {
-        if (data->res < 0)
+        if (data->res < 0 && data->res != -ECANCELED)
         {
            throw std::runtime_error(std::string("epoll failed: ") + strerror(-data->res));
         }
         handle_epoll_events();
     };
     ringloop->submit();
+    // FIXME With SQ thread we have no guarantee that epoll request will be submitted right here...
     int nfds;
     epoll_event events[MAX_EPOLL_EVENTS];
 restart:
@@ -219,12 +223,13 @@ restart:
                 .peer_addr = addr,
                 .peer_port = ntohs(addr.sin_port),
                 .peer_fd = peer_fd,
+                .peer_fd_index = ringloop->register_fd(peer_fd),
                 .peer_state = PEER_CONNECTED,
             };
             // Add FD to epoll
             epoll_event ev;
             ev.data.fd = peer_fd;
-            ev.events = EPOLLIN | EPOLLET | EPOLLRDHUP;
+            ev.events = EPOLLIN | EPOLLRDHUP | EPOLLET;
             if (epoll_ctl(epoll_fd, EPOLL_CTL_ADD, peer_fd, &ev) < 0)
             {
                 throw std::runtime_error(std::string("epoll_ctl: ") + strerror(errno));
@@ -263,7 +268,7 @@ restart:
                 }
             }
         }
-        if (nfds == MAX_EPOLL_EVENTS)
+        if (nfds > 0)
         {
             goto restart;
         }
diff --git a/osd.h b/osd.h
index 7784db9f..9d494f22 100644
--- a/osd.h
+++ b/osd.h
@@ -122,7 +122,7 @@ struct osd_client_t
 {
     sockaddr_in peer_addr;
     int peer_port;
-    int peer_fd;
+    int peer_fd, peer_fd_index;
     int peer_state;
     std::function connect_callback;
     osd_num_t osd_num = 0;
@@ -196,7 +196,7 @@ class osd_t
     timerfd_interval *tick_tfd;
 
     int wait_state = 0;
-    int epoll_fd = 0;
+    int epoll_fd = 0, epoll_fd_index = -1;
     int listen_fd = 0;
     ring_consumer_t consumer;
 
diff --git a/osd_peering.cpp b/osd_peering.cpp
index 6abaa963..d2153f93 100644
--- a/osd_peering.cpp
+++ b/osd_peering.cpp
@@ -116,6 +116,7 @@ void osd_t::handle_connect_result(int peer_fd)
     int one = 1;
     setsockopt(peer_fd, SOL_TCP, TCP_NODELAY, &one, sizeof(one));
     // Disable EPOLLOUT on this fd
+    cl.peer_fd_index = ringloop->register_fd(peer_fd);
     cl.connect_callback = NULL;
     cl.peer_state = PEER_CONNECTED;
     epoll_event ev;
diff --git a/osd_receive.cpp b/osd_receive.cpp
index 2de701b7..cfa4ead4 100644
--- a/osd_receive.cpp
+++ b/osd_receive.cpp
@@ -32,36 +32,38 @@ void osd_t::read_requests()
         cl.read_msg.msg_iov = &cl.read_iov;
         cl.read_msg.msg_iovlen = 1;
         data->callback = [this, peer_fd](ring_data_t *data) { handle_read(data, peer_fd); };
-        my_uring_prep_recvmsg(sqe, peer_fd, &cl.read_msg, 0);
+        my_uring_prep_recvmsg(sqe, cl.peer_fd_index, &cl.read_msg, 0);
+        sqe->flags |= IOSQE_FIXED_FILE;
     }
     read_ready_clients.clear();
 }
 
 void osd_t::handle_read(ring_data_t *data, int peer_fd)
 {
+    int res = data->res;
     auto cl_it = clients.find(peer_fd);
     if (cl_it != clients.end())
     {
         auto & cl = cl_it->second;
-        if (data->res == -EAGAIN)
+        if (res == -EAGAIN)
         {
             cl.read_ready--;
             if (cl.read_ready > 0)
                 read_ready_clients.push_back(peer_fd);
             return;
         }
-        else if (data->res < 0)
+        else if (res < 0)
         {
             // this is a client socket, so don't panic. just disconnect it
-            printf("Client %d socket read error: %d (%s). Disconnecting client\n", peer_fd, -data->res, strerror(-data->res));
+            printf("Client %d socket read error: %d (%s). Disconnecting client\n", peer_fd, -res, strerror(-res));
             stop_client(peer_fd);
             return;
         }
         read_ready_clients.push_back(peer_fd);
-        if (data->res > 0)
+        if (res > 0)
         {
-            cl.read_remaining -= data->res;
-            cl.read_buf += data->res;
+            cl.read_remaining -= res;
+            cl.read_buf += res;
             if (cl.read_remaining <= 0)
             {
                 cl.read_buf = NULL;
diff --git a/osd_send.cpp b/osd_send.cpp
index 56d79ca2..2b91f842 100644
--- a/osd_send.cpp
+++ b/osd_send.cpp
@@ -49,7 +49,8 @@ bool osd_t::try_send(osd_client_t & cl)
     cl.write_msg.msg_iov = cl.write_op->send_list.get_iovec();
     cl.write_msg.msg_iovlen = cl.write_op->send_list.get_size();
     data->callback = [this, peer_fd](ring_data_t *data) { handle_send(data, peer_fd); };
-    my_uring_prep_sendmsg(sqe, peer_fd, &cl.write_msg, 0);
+    my_uring_prep_sendmsg(sqe, cl.peer_fd_index, &cl.write_msg, 0);
+    sqe->flags |= IOSQE_FIXED_FILE;
     return true;
 }
 
@@ -69,32 +70,33 @@ void osd_t::send_replies()
 
 void osd_t::handle_send(ring_data_t *data, int peer_fd)
 {
+    int res = data->res;
     auto cl_it = clients.find(peer_fd);
     if (cl_it != clients.end())
     {
         auto & cl = cl_it->second;
-        if (data->res < 0 && data->res != -EAGAIN)
+        if (res < 0 && res != -EAGAIN)
         {
             // this is a client socket, so don't panic. just disconnect it
-            printf("Client %d socket write error: %d (%s). Disconnecting client\n", peer_fd, -data->res, strerror(-data->res));
+            printf("Client %d socket write error: %d (%s). Disconnecting client\n", peer_fd, -res, strerror(-res));
             stop_client(peer_fd);
             return;
         }
-        if (data->res >= 0)
+        if (res >= 0)
         {
             osd_op_t *cur_op = cl.write_op;
-            while (data->res > 0 && cur_op->send_list.sent < cur_op->send_list.count)
+            while (res > 0 && cur_op->send_list.sent < cur_op->send_list.count)
             {
                 iovec & iov = cur_op->send_list.buf[cur_op->send_list.sent];
-                if (iov.iov_len <= data->res)
+                if (iov.iov_len <= res)
                 {
-                    data->res -= iov.iov_len;
+                    res -= iov.iov_len;
                     cur_op->send_list.sent++;
                 }
                 else
                 {
-                    iov.iov_len -= data->res;
-                    iov.iov_base += data->res;
+                    iov.iov_len -= res;
+                    iov.iov_base += res;
                     break;
                 }
             }
diff --git a/ringloop.cpp b/ringloop.cpp
index 7c1598be..c1989a3f 100644
--- a/ringloop.cpp
+++ b/ringloop.cpp
@@ -1,14 +1,19 @@
+#include <set>
+
 #include "ringloop.h"
 
 ring_loop_t::ring_loop_t(int qd)
 {
-    int ret = io_uring_queue_init(qd, &ring, 0);
+    io_uring_params params = { 0 };
+    params.flags = IORING_SETUP_SQPOLL;
+    params.sq_thread_idle = 10;
+    int ret = io_uring_queue_init_params(qd, &ring, &params);
     if (ret < 0)
     {
         throw std::runtime_error(std::string("io_uring_queue_init: ") + strerror(-ret));
     }
-    free_ring_data_ptr = *ring.cq.kring_entries;
-    ring_datas = (struct ring_data_t*)malloc(sizeof(ring_data_t) * free_ring_data_ptr);
+    ring_data_total = free_ring_data_ptr = *ring.cq.kring_entries;
+    ring_datas = (ring_data_t*)malloc(sizeof(ring_data_t) * free_ring_data_ptr);
     free_ring_data = (int*)malloc(sizeof(int) * free_ring_data_ptr);
     if (!ring_datas || !free_ring_data)
     {
@@ -16,6 +21,7 @@ ring_loop_t::ring_loop_t(int qd)
     }
     for (int i = 0; i < free_ring_data_ptr; i++)
     {
+        ring_datas[i] = { 0 };
         free_ring_data[i] = i;
     }
 }
@@ -27,6 +33,105 @@ ring_loop_t::~ring_loop_t()
     io_uring_queue_exit(&ring);
 }
 
+void ring_loop_t::drain_events(void *completions_ptr)
+{
+    std::set<ring_data_t*> & completions = *((std::set<ring_data_t*>*)completions_ptr);
+    if (free_ring_data_ptr < ring_data_total)
+    {
+        // Try to cancel requests that are allowed to be canceled by the caller (epoll, timerfd and similar)
+        for (int i = 0; i < ring_data_total; i++)
+        {
+            if (ring_datas[i].allow_cancel)
+            {
+                // allow_cancel may only be true while the operation is inflight
+                io_uring_sqe *sqe = get_sqe();
+                if (!sqe)
+                {
+                    throw std::runtime_error("can't get SQE to cancel operation");
+                }
+                ring_data_t *data = (ring_data_t*)sqe->user_data;
+                data->callback = NULL;
+                ring_datas[i].res = -ECANCELED;
+                my_uring_prep_cancel(sqe, &ring_datas[i], 0);
+                // It seems (FIXME) cancel operations don't always get completions
+                completions.insert(data);
+            }
+        }
+        if (completions.size() > 0)
+        {
+            submit();
+        }
+    }
+    int inflight = ring_data_total - free_ring_data_ptr;
+    while (completions.size() < inflight)
+    {
+        io_uring_cqe *cqe;
+        while (!io_uring_peek_cqe(&ring, &cqe))
+        {
+            ring_data_t *d = (ring_data_t*)cqe->user_data;
+            d->res = cqe->res;
+            d->allow_cancel = false;
+            completions.insert(d);
+            io_uring_cqe_seen(&ring, cqe);
+        }
+        if (completions.size() < inflight)
+        {
+            wait();
+        }
+    }
+}
+
+void ring_loop_t::run_completions(void *completions_ptr)
+{
+    std::set<ring_data_t*> & completions = *((std::set<ring_data_t*>*)completions_ptr);
+    // Call event callbacks
+    for (ring_data_t *d: completions)
+    {
+        free_ring_data[free_ring_data_ptr++] = d - ring_datas;
+        if (d->callback)
+            d->callback(d);
+    }
+}
+
+int ring_loop_t::register_fd(int fd)
+{
+    std::set<ring_data_t*> completions;
+    drain_events((void*)&completions);
+    // Modify registered files
+    int idx = reg_fds.size();
+    reg_fds.push_back(fd);
+    if (registered)
+    {
+        io_uring_unregister_files(&ring);
+    }
+    int ret = io_uring_register_files(&ring, reg_fds.data(), reg_fds.size());
+    if (ret != 0)
+    {
+        throw std::runtime_error(std::string("io_uring_register_files_update: ") + strerror(-ret));
+    }
+    registered = 1;
+    run_completions((void*)&completions);
+    return idx;
+}
+
+void ring_loop_t::unregister_fd(int fd_index)
+{
+    std::set<ring_data_t*> completions;
+    drain_events((void*)&completions);
+    // Modify registered files
+    reg_fds.erase(reg_fds.begin()+fd_index, reg_fds.begin()+fd_index+1);
+    if (registered)
+    {
+        io_uring_unregister_files(&ring);
+    }
+    int ret = io_uring_register_files(&ring, reg_fds.data(), reg_fds.size());
+    if (ret != 0)
+    {
+        throw std::runtime_error(std::string("io_uring_register_files_update: ") + strerror(-ret));
+    }
+    run_completions((void*)&completions);
+}
+
 int ring_loop_t::register_consumer(ring_consumer_t & consumer)
 {
     consumer.number = consumers.size();
@@ -50,17 +155,16 @@ void ring_loop_t::unregister_consumer(ring_consumer_t & consumer)
 
 void ring_loop_t::loop()
 {
-    struct io_uring_cqe *cqe;
+    io_uring_cqe *cqe;
     while (!io_uring_peek_cqe(&ring, &cqe))
     {
-        struct ring_data_t *d = (struct ring_data_t*)cqe->user_data;
-        if (d->callback)
-        {
-            d->res = cqe->res;
-            d->callback(d);
-        }
-        free_ring_data[free_ring_data_ptr++] = d - ring_datas;
+        ring_data_t *d = (ring_data_t*)cqe->user_data;
+        d->res = cqe->res;
+        d->allow_cancel = false;
         io_uring_cqe_seen(&ring, cqe);
+        free_ring_data[free_ring_data_ptr++] = d - ring_datas;
+        if (d->callback)
+            d->callback(d);
     }
     do
     {
diff --git a/ringloop.h b/ringloop.h
index 4014cfb4..091584cf 100644
--- a/ringloop.h
+++ b/ringloop.h
@@ -107,6 +107,7 @@ static inline void my_uring_prep_cancel(struct io_uring_sqe *sqe, void *user_dat
 struct ring_data_t
 {
     struct iovec iov; // for single-entry read/write operations
+    bool allow_cancel;
     int res;
     std::function<void(ring_data_t*)> callback;
 };
@@ -122,22 +123,35 @@ class ring_loop_t
     std::vector<ring_consumer_t> consumers;
     struct ring_data_t *ring_datas;
     int *free_ring_data;
-    unsigned free_ring_data_ptr;
+    unsigned free_ring_data_ptr, ring_data_total;
     bool loop_again;
     struct io_uring ring;
+    int registered = 0;
+    std::vector<int> reg_fds;
 
+    void drain_events(void *completions_ptr);
+    void run_completions(void *completions_ptr);
+
 public:
     ring_loop_t(int qd);
     ~ring_loop_t();
     int register_consumer(ring_consumer_t & consumer);
     void unregister_consumer(ring_consumer_t & consumer);
+    int register_fd(int fd);
+    void unregister_fd(int fd_index);
+
     inline struct io_uring_sqe* get_sqe()
     {
         if (free_ring_data_ptr == 0)
+        {
             return NULL;
+        }
         struct io_uring_sqe* sqe = io_uring_get_sqe(&ring);
         if (sqe)
-            io_uring_sqe_set_data(sqe, ring_datas + free_ring_data[--free_ring_data_ptr]);
+        {
+            ring_data_t *data = ring_datas + free_ring_data[--free_ring_data_ptr];
+            io_uring_sqe_set_data(sqe, data);
+        }
         return sqe;
     }
     inline int submit()
diff --git a/timerfd_interval.cpp b/timerfd_interval.cpp
index a7903b5f..cfa600f6 100644
--- a/timerfd_interval.cpp
+++ b/timerfd_interval.cpp
@@ -21,6 +21,7 @@ timerfd_interval::timerfd_interval(ring_loop_t *ringloop, int seconds, std::func
     }
     consumer.loop = [this]() { loop(); };
     ringloop->register_consumer(consumer);
+    timerfd_index = ringloop->register_fd(timerfd);
     this->ringloop = ringloop;
     this->callback = cb;
 }
@@ -44,10 +45,12 @@ void timerfd_interval::loop()
         return;
     }
     struct ring_data_t *data = ((ring_data_t*)sqe->user_data);
-    my_uring_prep_poll_add(sqe, timerfd, POLLIN);
+    my_uring_prep_poll_add(sqe, timerfd_index, POLLIN);
+    sqe->flags |= IOSQE_FIXED_FILE;
+    data->allow_cancel = true;
    data->callback = [&](ring_data_t *data)
     {
-        if (data->res < 0)
+        if (data->res < 0 && data->res != -ECANCELED)
         {
             throw std::runtime_error(std::string("waiting for timer failed: ") + strerror(-data->res));
         }
diff --git a/timerfd_interval.h b/timerfd_interval.h
index 84d9587c..3019f9da 100644
--- a/timerfd_interval.h
+++ b/timerfd_interval.h
@@ -5,7 +5,7 @@
 class timerfd_interval
 {
     int wait_state;
-    int timerfd;
+    int timerfd, timerfd_index;
     int status;
     ring_loop_t *ringloop;
     ring_consumer_t consumer;
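
Note (not part of the patch): every changed my_uring_prep_* call above follows the same rule — the fd is registered with the ring once via ring_loop_t::register_fd(), the prep call then receives the index into the registered-file table instead of the raw descriptor, and the SQE is marked with IOSQE_FIXED_FILE. Below is a minimal standalone liburing sketch of that pattern, independent of the ring_loop_t wrapper; the file path, queue depth and buffer contents are arbitrary placeholders, and SQPOLL (which the patched ring_loop_t also enables) is left out for brevity. Build with -luring.

    // fixed_file_demo.cpp (hypothetical example): register one file, then submit I/O by table index.
    #include <liburing.h>
    #include <fcntl.h>
    #include <unistd.h>
    #include <stdio.h>

    int main()
    {
        io_uring ring;
        if (io_uring_queue_init(8, &ring, 0) < 0)
            return 1;
        int fd = open("/tmp/fixed_file_demo.bin", O_RDWR | O_CREAT | O_TRUNC, 0644);
        if (fd < 0)
            return 1;
        // Register the descriptor: it becomes index 0 of the ring's file table
        if (io_uring_register_files(&ring, &fd, 1) != 0)
            return 1;
        char buf[] = "hello";
        iovec iov = { buf, sizeof(buf) };
        io_uring_sqe *sqe = io_uring_get_sqe(&ring);
        // Pass the table index (0), not the raw fd, and mark the SQE as fixed-file
        io_uring_prep_writev(sqe, 0, &iov, 1, 0);
        sqe->flags |= IOSQE_FIXED_FILE;
        io_uring_submit(&ring);
        io_uring_cqe *cqe;
        io_uring_wait_cqe(&ring, &cqe);
        printf("write result: %d\n", cqe->res);
        io_uring_cqe_seen(&ring, cqe);
        io_uring_unregister_files(&ring);
        io_uring_queue_exit(&ring);
        close(fd);
        return 0;
    }

Without IOSQE_FIXED_FILE the kernel would interpret the index as an ordinary descriptor number, which is why the patch sets the flag on every SQE that now carries an index.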