@@ -287,8 +287,9 @@ resume_1: | |||
data->iov = (struct iovec){ it->buf, (size_t)it->len }; | |||
data->callback = simple_callback_w; | |||
my_uring_prep_writev( | |||
sqe, bs->data_fd, &data->iov, 1, bs->data_offset + clean_loc + it->offset | |||
sqe, bs->data_fd_index, &data->iov, 1, bs->data_offset + clean_loc + it->offset | |||
); | |||
sqe->flags |= IOSQE_FIXED_FILE; | |||
wait_count++; | |||
} | |||
// Sync data before writing metadata | |||
@@ -320,8 +321,9 @@ resume_1: | |||
data->iov = (struct iovec){ meta_old.buf, bs->meta_block_size }; | |||
data->callback = simple_callback_w; | |||
my_uring_prep_writev( | |||
sqe, bs->meta_fd, &data->iov, 1, bs->meta_offset + meta_old.sector | |||
sqe, bs->meta_fd_index, &data->iov, 1, bs->meta_offset + meta_old.sector | |||
); | |||
sqe->flags |= IOSQE_FIXED_FILE; | |||
wait_count++; | |||
} | |||
if (has_delete) | |||
@@ -342,8 +344,9 @@ resume_1: | |||
data->iov = (struct iovec){ meta_new.buf, bs->meta_block_size }; | |||
data->callback = simple_callback_w; | |||
my_uring_prep_writev( | |||
sqe, bs->meta_fd, &data->iov, 1, bs->meta_offset + meta_new.sector | |||
sqe, bs->meta_fd_index, &data->iov, 1, bs->meta_offset + meta_new.sector | |||
); | |||
sqe->flags |= IOSQE_FIXED_FILE; | |||
wait_count++; | |||
resume_7: | |||
if (wait_count > 0) | |||
@@ -405,7 +408,8 @@ resume_1: | |||
((journal_entry_start*)flusher->journal_superblock)->crc32 = je_crc32((journal_entry*)flusher->journal_superblock); | |||
data->iov = (struct iovec){ flusher->journal_superblock, bs->journal_block_size }; | |||
data->callback = simple_callback_w; | |||
my_uring_prep_writev(sqe, bs->journal.fd, &data->iov, 1, bs->journal.offset); | |||
my_uring_prep_writev(sqe, bs->journal_fd_index, &data->iov, 1, bs->journal.offset); | |||
sqe->flags |= IOSQE_FIXED_FILE; | |||
wait_count++; | |||
resume_13: | |||
if (wait_count > 0) | |||
@@ -485,8 +489,9 @@ bool journal_flusher_co::scan_dirty(int wait_base) | |||
data->iov = (struct iovec){ v.back().buf, (size_t)submit_len }; | |||
data->callback = simple_callback_r; | |||
my_uring_prep_readv( | |||
sqe, bs->journal.fd, &data->iov, 1, bs->journal.offset + submit_offset | |||
sqe, bs->journal_fd_index, &data->iov, 1, bs->journal.offset + submit_offset | |||
); | |||
sqe->flags |= IOSQE_FIXED_FILE; | |||
wait_count++; | |||
} | |||
} | |||
@@ -568,8 +573,9 @@ bool journal_flusher_co::modify_meta_read(uint64_t meta_loc, flusher_meta_write_ | |||
data->callback = simple_callback_r; | |||
wr.submitted = true; | |||
my_uring_prep_readv( | |||
sqe, bs->meta_fd, &data->iov, 1, bs->meta_offset + wr.sector | |||
sqe, bs->meta_fd_index, &data->iov, 1, bs->meta_offset + wr.sector | |||
); | |||
sqe->flags |= IOSQE_FIXED_FILE; | |||
wait_count++; | |||
} | |||
else | |||
@@ -638,7 +644,8 @@ bool journal_flusher_co::fsync_batch(bool fsync_meta, int wait_base) | |||
await_sqe(0); | |||
data->iov = { 0 }; | |||
data->callback = simple_callback_w; | |||
my_uring_prep_fsync(sqe, fsync_meta ? bs->meta_fd : bs->data_fd, IORING_FSYNC_DATASYNC); | |||
my_uring_prep_fsync(sqe, fsync_meta ? bs->meta_fd_index : bs->data_fd_index, IORING_FSYNC_DATASYNC); | |||
sqe->flags |= IOSQE_FIXED_FILE; | |||
cur_sync->state = 1; | |||
wait_count++; | |||
resume_1: | |||
@@ -216,6 +216,7 @@ class blockstore_impl_t | |||
int data_fd; | |||
uint64_t meta_size, meta_area, meta_len; | |||
uint64_t data_size, data_len; | |||
int meta_fd_index, data_fd_index, journal_fd_index; | |||
void *metadata_buffer = NULL; | |||
@@ -55,7 +55,8 @@ int blockstore_init_meta::loop() | |||
bs->meta_len - metadata_read > bs->metadata_buf_size ? bs->metadata_buf_size : bs->meta_len - metadata_read, | |||
}; | |||
data->callback = [this](ring_data_t *data) { handle_event(data); }; | |||
my_uring_prep_readv(sqe, bs->meta_fd, &data->iov, 1, bs->meta_offset + metadata_read); | |||
my_uring_prep_readv(sqe, bs->meta_fd_index, &data->iov, 1, bs->meta_offset + metadata_read); | |||
sqe->flags |= IOSQE_FIXED_FILE; | |||
bs->ringloop->submit(); | |||
submitted = (prev == 1 ? 2 : 1); | |||
prev = submitted; | |||
@@ -216,7 +217,8 @@ int blockstore_init_journal::loop() | |||
data = ((ring_data_t*)sqe->user_data); | |||
data->iov = { submitted_buf, bs->journal.block_size }; | |||
data->callback = simple_callback; | |||
my_uring_prep_readv(sqe, bs->journal.fd, &data->iov, 1, bs->journal.offset); | |||
my_uring_prep_readv(sqe, bs->journal_fd_index, &data->iov, 1, bs->journal.offset); | |||
sqe->flags |= IOSQE_FIXED_FILE; | |||
bs->ringloop->submit(); | |||
wait_count = 1; | |||
resume_1: | |||
@@ -254,7 +256,8 @@ resume_1: | |||
GET_SQE(); | |||
data->iov = (struct iovec){ submitted_buf, 2*bs->journal.block_size }; | |||
data->callback = simple_callback; | |||
my_uring_prep_writev(sqe, bs->journal.fd, &data->iov, 1, bs->journal.offset); | |||
my_uring_prep_writev(sqe, bs->journal_fd_index, &data->iov, 1, bs->journal.offset); | |||
sqe->flags |= IOSQE_FIXED_FILE; | |||
wait_count++; | |||
bs->ringloop->submit(); | |||
resume_6: | |||
@@ -266,7 +269,8 @@ resume_1: | |||
if (!bs->disable_journal_fsync) | |||
{ | |||
GET_SQE(); | |||
my_uring_prep_fsync(sqe, bs->journal.fd, IORING_FSYNC_DATASYNC); | |||
my_uring_prep_fsync(sqe, bs->journal_fd_index, IORING_FSYNC_DATASYNC); | |||
sqe->flags |= IOSQE_FIXED_FILE; | |||
data->iov = { 0 }; | |||
data->callback = simple_callback; | |||
wait_count++; | |||
@@ -325,7 +329,8 @@ resume_1: | |||
end - journal_pos < JOURNAL_BUFFER_SIZE ? end - journal_pos : JOURNAL_BUFFER_SIZE, | |||
}; | |||
data->callback = [this](ring_data_t *data1) { handle_event(data1); }; | |||
my_uring_prep_readv(sqe, bs->journal.fd, &data->iov, 1, bs->journal.offset + journal_pos); | |||
my_uring_prep_readv(sqe, bs->journal_fd_index, &data->iov, 1, bs->journal.offset + journal_pos); | |||
sqe->flags |= IOSQE_FIXED_FILE; | |||
bs->ringloop->submit(); | |||
} | |||
while (done.size() > 0) | |||
@@ -340,7 +345,8 @@ resume_1: | |||
GET_SQE(); | |||
data->iov = { init_write_buf, bs->journal.block_size }; | |||
data->callback = simple_callback; | |||
my_uring_prep_writev(sqe, bs->journal.fd, &data->iov, 1, bs->journal.offset + init_write_sector); | |||
my_uring_prep_writev(sqe, bs->journal_fd_index, &data->iov, 1, bs->journal.offset + init_write_sector); | |||
sqe->flags |= IOSQE_FIXED_FILE; | |||
wait_count++; | |||
bs->ringloop->submit(); | |||
resume_7: | |||
@@ -354,7 +360,8 @@ resume_1: | |||
GET_SQE(); | |||
data->iov = { 0 }; | |||
data->callback = simple_callback; | |||
my_uring_prep_fsync(sqe, bs->journal.fd, IORING_FSYNC_DATASYNC); | |||
my_uring_prep_fsync(sqe, bs->journal_fd_index, IORING_FSYNC_DATASYNC); | |||
sqe->flags |= IOSQE_FIXED_FILE; | |||
wait_count++; | |||
bs->ringloop->submit(); | |||
} | |||
@@ -126,8 +126,9 @@ void prepare_journal_sector_write(journal_t & journal, int cur_sector, io_uring_ | |||
}; | |||
data->callback = cb; | |||
my_uring_prep_writev( | |||
sqe, journal.fd, &data->iov, 1, journal.offset + journal.sector_info[cur_sector].offset | |||
sqe, journal.fd_index, &data->iov, 1, journal.offset + journal.sector_info[cur_sector].offset | |||
); | |||
sqe->flags |= IOSQE_FIXED_FILE; | |||
} | |||
journal_t::~journal_t() | |||
@@ -130,7 +130,7 @@ struct journal_sector_info_t | |||
struct journal_t | |||
{ | |||
int fd; | |||
int fd, fd_index; | |||
uint64_t device_size; | |||
bool inmemory = false; | |||
void *buffer = NULL; | |||
@@ -140,6 +140,10 @@ void blockstore_impl_t::parse_config(blockstore_config_t & config) | |||
void blockstore_impl_t::calc_lengths() | |||
{ | |||
// register fds | |||
data_fd_index = ringloop->register_fd(data_fd); | |||
meta_fd_index = meta_fd == data_fd ? data_fd_index : ringloop->register_fd(meta_fd); | |||
journal.fd_index = journal_fd_index = journal.fd == meta_fd ? meta_fd_index : ringloop->register_fd(journal.fd); | |||
// data | |||
data_len = data_size - data_offset; | |||
if (data_fd == meta_fd && data_offset < meta_offset) | |||
@@ -31,10 +31,11 @@ int blockstore_impl_t::fulfill_read_push(blockstore_op_t *op, void *buf, uint64_ | |||
PRIV(op)->pending_ops++; | |||
my_uring_prep_readv( | |||
sqe, | |||
IS_JOURNAL(item_state) ? journal.fd : data_fd, | |||
IS_JOURNAL(item_state) ? journal_fd_index : data_fd_index, | |||
&data->iov, 1, | |||
(IS_JOURNAL(item_state) ? journal.offset : data_offset) + offset | |||
); | |||
sqe->flags |= IOSQE_FIXED_FILE; | |||
data->callback = [this, op](ring_data_t *data) { handle_read_event(data, op); }; | |||
return 1; | |||
} | |||
@@ -78,7 +78,8 @@ int blockstore_impl_t::continue_sync(blockstore_op_t *op) | |||
if (!disable_data_fsync) | |||
{ | |||
BS_SUBMIT_GET_SQE(sqe, data); | |||
my_uring_prep_fsync(sqe, data_fd, IORING_FSYNC_DATASYNC); | |||
my_uring_prep_fsync(sqe, data_fd_index, IORING_FSYNC_DATASYNC); | |||
sqe->flags |= IOSQE_FIXED_FILE; | |||
data->iov = { 0 }; | |||
data->callback = cb; | |||
PRIV(op)->min_used_journal_sector = PRIV(op)->max_used_journal_sector = 0; | |||
@@ -161,7 +162,8 @@ int blockstore_impl_t::continue_sync(blockstore_op_t *op) | |||
if (!disable_journal_fsync) | |||
{ | |||
BS_SUBMIT_GET_SQE(sqe, data); | |||
my_uring_prep_fsync(sqe, journal.fd, IORING_FSYNC_DATASYNC); | |||
my_uring_prep_fsync(sqe, journal_fd_index, IORING_FSYNC_DATASYNC); | |||
sqe->flags |= IOSQE_FIXED_FILE; | |||
data->iov = { 0 }; | |||
data->callback = cb; | |||
PRIV(op)->pending_ops = 1; | |||
@@ -122,8 +122,9 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op) | |||
data->iov.iov_len = op->len + stripe_offset + stripe_end; // to check it in the callback | |||
data->callback = [this, op](ring_data_t *data) { handle_write_event(data, op); }; | |||
my_uring_prep_writev( | |||
sqe, data_fd, PRIV(op)->iov_zerofill, vcnt, data_offset + (loc << block_order) + op->offset - stripe_offset | |||
sqe, data_fd_index, PRIV(op)->iov_zerofill, vcnt, data_offset + (loc << block_order) + op->offset - stripe_offset | |||
); | |||
sqe->flags |= IOSQE_FIXED_FILE; | |||
PRIV(op)->pending_ops = 1; | |||
PRIV(op)->min_used_journal_sector = PRIV(op)->max_used_journal_sector = 0; | |||
// Remember big write as unsynced | |||
@@ -198,8 +199,9 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op) | |||
data2->iov = (struct iovec){ op->buf, op->len }; | |||
data2->callback = cb; | |||
my_uring_prep_writev( | |||
sqe2, journal.fd, &data2->iov, 1, journal.offset + journal.next_free | |||
sqe2, journal_fd_index, &data2->iov, 1, journal.offset + journal.next_free | |||
); | |||
sqe2->flags |= IOSQE_FIXED_FILE; | |||
PRIV(op)->pending_ops++; | |||
} | |||
else | |||
@@ -110,6 +110,7 @@ osd_t::osd_t(blockstore_config_t & config, blockstore_t *bs, ring_loop_t *ringlo | |||
close(listen_fd); | |||
throw std::runtime_error(std::string("epoll_create: ") + strerror(errno)); | |||
} | |||
epoll_fd_index = ringloop->register_fd(epoll_fd); | |||
epoll_event ev; | |||
ev.data.fd = listen_fd; | |||
@@ -186,16 +187,19 @@ void osd_t::handle_epoll_events() | |||
throw std::runtime_error("can't get SQE, will fall out of sync with EPOLLET"); | |||
} | |||
ring_data_t *data = ((ring_data_t*)sqe->user_data); | |||
my_uring_prep_poll_add(sqe, epoll_fd, POLLIN); | |||
data->allow_cancel = true; | |||
my_uring_prep_poll_add(sqe, epoll_fd_index, POLLIN); | |||
sqe->flags |= IOSQE_FIXED_FILE; | |||
data->callback = [this](ring_data_t *data) | |||
{ | |||
if (data->res < 0) | |||
if (data->res < 0 && data->res != -ECANCELED) | |||
{ | |||
throw std::runtime_error(std::string("epoll failed: ") + strerror(-data->res)); | |||
} | |||
handle_epoll_events(); | |||
}; | |||
ringloop->submit(); | |||
// FIXME With SQ thread we have no guarantee that epoll request will be submitted right here... | |||
int nfds; | |||
epoll_event events[MAX_EPOLL_EVENTS]; | |||
restart: | |||
@@ -219,12 +223,13 @@ restart: | |||
.peer_addr = addr, | |||
.peer_port = ntohs(addr.sin_port), | |||
.peer_fd = peer_fd, | |||
.peer_fd_index = ringloop->register_fd(peer_fd), | |||
.peer_state = PEER_CONNECTED, | |||
}; | |||
// Add FD to epoll | |||
epoll_event ev; | |||
ev.data.fd = peer_fd; | |||
ev.events = EPOLLIN | EPOLLET | EPOLLRDHUP; | |||
ev.events = EPOLLIN | EPOLLRDHUP | EPOLLET; | |||
if (epoll_ctl(epoll_fd, EPOLL_CTL_ADD, peer_fd, &ev) < 0) | |||
{ | |||
throw std::runtime_error(std::string("epoll_ctl: ") + strerror(errno)); | |||
@@ -263,7 +268,7 @@ restart: | |||
} | |||
} | |||
} | |||
if (nfds == MAX_EPOLL_EVENTS) | |||
if (nfds > 0) | |||
{ | |||
goto restart; | |||
} | |||
@@ -122,7 +122,7 @@ struct osd_client_t | |||
{ | |||
sockaddr_in peer_addr; | |||
int peer_port; | |||
int peer_fd; | |||
int peer_fd, peer_fd_index; | |||
int peer_state; | |||
std::function<void(osd_num_t, int)> connect_callback; | |||
osd_num_t osd_num = 0; | |||
@@ -196,7 +196,7 @@ class osd_t | |||
timerfd_interval *tick_tfd; | |||
int wait_state = 0; | |||
int epoll_fd = 0; | |||
int epoll_fd = 0, epoll_fd_index = -1; | |||
int listen_fd = 0; | |||
ring_consumer_t consumer; | |||
@@ -116,6 +116,7 @@ void osd_t::handle_connect_result(int peer_fd) | |||
int one = 1; | |||
setsockopt(peer_fd, SOL_TCP, TCP_NODELAY, &one, sizeof(one)); | |||
// Disable EPOLLOUT on this fd | |||
cl.peer_fd_index = ringloop->register_fd(peer_fd); | |||
cl.connect_callback = NULL; | |||
cl.peer_state = PEER_CONNECTED; | |||
epoll_event ev; | |||
@@ -32,36 +32,38 @@ void osd_t::read_requests() | |||
cl.read_msg.msg_iov = &cl.read_iov; | |||
cl.read_msg.msg_iovlen = 1; | |||
data->callback = [this, peer_fd](ring_data_t *data) { handle_read(data, peer_fd); }; | |||
my_uring_prep_recvmsg(sqe, peer_fd, &cl.read_msg, 0); | |||
my_uring_prep_recvmsg(sqe, cl.peer_fd_index, &cl.read_msg, 0); | |||
sqe->flags |= IOSQE_FIXED_FILE; | |||
} | |||
read_ready_clients.clear(); | |||
} | |||
void osd_t::handle_read(ring_data_t *data, int peer_fd) | |||
{ | |||
int res = data->res; | |||
auto cl_it = clients.find(peer_fd); | |||
if (cl_it != clients.end()) | |||
{ | |||
auto & cl = cl_it->second; | |||
if (data->res == -EAGAIN) | |||
if (res == -EAGAIN) | |||
{ | |||
cl.read_ready--; | |||
if (cl.read_ready > 0) | |||
read_ready_clients.push_back(peer_fd); | |||
return; | |||
} | |||
else if (data->res < 0) | |||
else if (res < 0) | |||
{ | |||
// this is a client socket, so don't panic. just disconnect it | |||
printf("Client %d socket read error: %d (%s). Disconnecting client\n", peer_fd, -data->res, strerror(-data->res)); | |||
printf("Client %d socket read error: %d (%s). Disconnecting client\n", peer_fd, -res, strerror(-res)); | |||
stop_client(peer_fd); | |||
return; | |||
} | |||
read_ready_clients.push_back(peer_fd); | |||
if (data->res > 0) | |||
if (res > 0) | |||
{ | |||
cl.read_remaining -= data->res; | |||
cl.read_buf += data->res; | |||
cl.read_remaining -= res; | |||
cl.read_buf += res; | |||
if (cl.read_remaining <= 0) | |||
{ | |||
cl.read_buf = NULL; | |||
@@ -49,7 +49,8 @@ bool osd_t::try_send(osd_client_t & cl) | |||
cl.write_msg.msg_iov = cl.write_op->send_list.get_iovec(); | |||
cl.write_msg.msg_iovlen = cl.write_op->send_list.get_size(); | |||
data->callback = [this, peer_fd](ring_data_t *data) { handle_send(data, peer_fd); }; | |||
my_uring_prep_sendmsg(sqe, peer_fd, &cl.write_msg, 0); | |||
my_uring_prep_sendmsg(sqe, cl.peer_fd_index, &cl.write_msg, 0); | |||
sqe->flags |= IOSQE_FIXED_FILE; | |||
return true; | |||
} | |||
@@ -69,32 +70,33 @@ void osd_t::send_replies() | |||
void osd_t::handle_send(ring_data_t *data, int peer_fd) | |||
{ | |||
int res = data->res; | |||
auto cl_it = clients.find(peer_fd); | |||
if (cl_it != clients.end()) | |||
{ | |||
auto & cl = cl_it->second; | |||
if (data->res < 0 && data->res != -EAGAIN) | |||
if (res < 0 && res != -EAGAIN) | |||
{ | |||
// this is a client socket, so don't panic. just disconnect it | |||
printf("Client %d socket write error: %d (%s). Disconnecting client\n", peer_fd, -data->res, strerror(-data->res)); | |||
printf("Client %d socket write error: %d (%s). Disconnecting client\n", peer_fd, -res, strerror(-res)); | |||
stop_client(peer_fd); | |||
return; | |||
} | |||
if (data->res >= 0) | |||
if (res >= 0) | |||
{ | |||
osd_op_t *cur_op = cl.write_op; | |||
while (data->res > 0 && cur_op->send_list.sent < cur_op->send_list.count) | |||
while (res > 0 && cur_op->send_list.sent < cur_op->send_list.count) | |||
{ | |||
iovec & iov = cur_op->send_list.buf[cur_op->send_list.sent]; | |||
if (iov.iov_len <= data->res) | |||
if (iov.iov_len <= res) | |||
{ | |||
data->res -= iov.iov_len; | |||
res -= iov.iov_len; | |||
cur_op->send_list.sent++; | |||
} | |||
else | |||
{ | |||
iov.iov_len -= data->res; | |||
iov.iov_base += data->res; | |||
iov.iov_len -= res; | |||
iov.iov_base += res; | |||
break; | |||
} | |||
} | |||
@@ -1,14 +1,19 @@ | |||
#include <set> | |||
#include "ringloop.h" | |||
ring_loop_t::ring_loop_t(int qd) | |||
{ | |||
int ret = io_uring_queue_init(qd, &ring, 0); | |||
io_uring_params params = { 0 }; | |||
params.flags = IORING_SETUP_SQPOLL; | |||
params.sq_thread_idle = 10; | |||
int ret = io_uring_queue_init_params(qd, &ring, ¶ms); | |||
if (ret < 0) | |||
{ | |||
throw std::runtime_error(std::string("io_uring_queue_init: ") + strerror(-ret)); | |||
} | |||
free_ring_data_ptr = *ring.cq.kring_entries; | |||
ring_datas = (struct ring_data_t*)malloc(sizeof(ring_data_t) * free_ring_data_ptr); | |||
ring_data_total = free_ring_data_ptr = *ring.cq.kring_entries; | |||
ring_datas = (ring_data_t*)malloc(sizeof(ring_data_t) * free_ring_data_ptr); | |||
free_ring_data = (int*)malloc(sizeof(int) * free_ring_data_ptr); | |||
if (!ring_datas || !free_ring_data) | |||
{ | |||
@@ -16,6 +21,7 @@ ring_loop_t::ring_loop_t(int qd) | |||
} | |||
for (int i = 0; i < free_ring_data_ptr; i++) | |||
{ | |||
ring_datas[i] = { 0 }; | |||
free_ring_data[i] = i; | |||
} | |||
} | |||
@@ -27,6 +33,105 @@ ring_loop_t::~ring_loop_t() | |||
io_uring_queue_exit(&ring); | |||
} | |||
void ring_loop_t::drain_events(void *completions_ptr)
{
    // Wait until the ring has no inflight requests, collecting completion
    // data into *completions_ptr (a std::set<ring_data_t*>) so the caller can
    // run the callbacks afterwards via run_completions(). Used before
    // changing the registered-file table, which requires an idle ring.
    std::set<ring_data_t*> & completions = *((std::set<ring_data_t*> *)completions_ptr);
    if (free_ring_data_ptr < ring_data_total)
    {
        // Try to cancel requests that are allowed to be canceled by the caller (epoll, timerfd and similar)
        for (int i = 0; i < ring_data_total; i++)
        {
            if (ring_datas[i].allow_cancel)
            {
                // allow_cancel may only be true while the operation is inflight
                io_uring_sqe *sqe = get_sqe();
                if (!sqe)
                {
                    throw std::runtime_error("can't get SQE to cancel operation");
                }
                ring_data_t *data = (ring_data_t*)sqe->user_data;
                // The cancel op itself needs no callback of its own
                data->callback = NULL;
                // Pre-set the result so the canceled op's callback sees -ECANCELED
                ring_datas[i].res = -ECANCELED;
                my_uring_prep_cancel(sqe, &ring_datas[i], 0);
                // It seems (FIXME) cancel operations don't always get completions
                completions.insert(data);
            }
        }
        if (completions.size() > 0)
        {
            submit();
        }
    }
    // Reap CQEs until every inflight request (including the cancels just
    // submitted, which also consumed ring_data slots) has completed
    int inflight = ring_data_total - free_ring_data_ptr;
    while (completions.size() < inflight)
    {
        io_uring_cqe *cqe;
        while (!io_uring_peek_cqe(&ring, &cqe))
        {
            ring_data_t *d = (ring_data_t*)cqe->user_data;
            d->res = cqe->res;
            d->allow_cancel = false;
            completions.insert(d);
            io_uring_cqe_seen(&ring, cqe);
        }
        if (completions.size() < inflight)
        {
            // Block until at least one more CQE arrives
            wait();
        }
    }
}
void ring_loop_t::run_completions(void *completions_ptr) | |||
{ | |||
std::set<ring_data_t*> & completions = *((std::set<ring_data_t*> *)completions_ptr); | |||
// Call event callbacks | |||
for (ring_data_t *d: completions) | |||
{ | |||
free_ring_data[free_ring_data_ptr++] = d - ring_datas; | |||
if (d->callback) | |||
d->callback(d); | |||
} | |||
} | |||
int ring_loop_t::register_fd(int fd) | |||
{ | |||
std::set<ring_data_t*> completions; | |||
drain_events((void*)&completions); | |||
// Modify registered files | |||
int idx = reg_fds.size(); | |||
reg_fds.push_back(fd); | |||
if (registered) | |||
{ | |||
io_uring_unregister_files(&ring); | |||
} | |||
int ret = io_uring_register_files(&ring, reg_fds.data(), reg_fds.size()); | |||
if (ret != 0) | |||
{ | |||
throw std::runtime_error(std::string("io_uring_register_files_update: ") + strerror(-ret)); | |||
} | |||
registered = 1; | |||
run_completions((void*)&completions); | |||
return idx; | |||
} | |||
void ring_loop_t::unregister_fd(int fd_index) | |||
{ | |||
std::set<ring_data_t*> completions; | |||
drain_events((void*)&completions); | |||
// Modify registered files | |||
reg_fds.erase(reg_fds.begin()+fd_index, reg_fds.begin()+fd_index+1); | |||
if (registered) | |||
{ | |||
io_uring_unregister_files(&ring); | |||
} | |||
int ret = io_uring_register_files(&ring, reg_fds.data(), reg_fds.size()); | |||
if (ret != 0) | |||
{ | |||
throw std::runtime_error(std::string("io_uring_register_files_update: ") + strerror(-ret)); | |||
} | |||
run_completions((void*)&completions); | |||
} | |||
int ring_loop_t::register_consumer(ring_consumer_t & consumer) | |||
{ | |||
consumer.number = consumers.size(); | |||
@@ -50,17 +155,16 @@ void ring_loop_t::unregister_consumer(ring_consumer_t & consumer) | |||
void ring_loop_t::loop() | |||
{ | |||
struct io_uring_cqe *cqe; | |||
io_uring_cqe *cqe; | |||
while (!io_uring_peek_cqe(&ring, &cqe)) | |||
{ | |||
struct ring_data_t *d = (struct ring_data_t*)cqe->user_data; | |||
ring_data_t *d = (ring_data_t*)cqe->user_data; | |||
d->res = cqe->res; | |||
d->allow_cancel = false; | |||
io_uring_cqe_seen(&ring, cqe); | |||
free_ring_data[free_ring_data_ptr++] = d - ring_datas; | |||
if (d->callback) | |||
{ | |||
d->res = cqe->res; | |||
d->callback(d); | |||
} | |||
free_ring_data[free_ring_data_ptr++] = d - ring_datas; | |||
io_uring_cqe_seen(&ring, cqe); | |||
} | |||
do | |||
{ | |||
@@ -107,6 +107,7 @@ static inline void my_uring_prep_cancel(struct io_uring_sqe *sqe, void *user_dat | |||
// Per-SQE bookkeeping attached to each io_uring request via sqe->user_data
struct ring_data_t
{
    struct iovec iov; // for single-entry read/write operations
    bool allow_cancel; // set while inflight if drain_events() may cancel this op; zeroed at ring init
    int res; // completion result from the CQE (negative errno on error)
    std::function<void(ring_data_t*)> callback; // invoked when the operation completes
};
@@ -122,22 +123,35 @@ class ring_loop_t | |||
std::vector<ring_consumer_t> consumers; | |||
struct ring_data_t *ring_datas; | |||
int *free_ring_data; | |||
unsigned free_ring_data_ptr; | |||
unsigned free_ring_data_ptr, ring_data_total; | |||
bool loop_again; | |||
struct io_uring ring; | |||
int registered = 0; | |||
std::vector<int> reg_fds; | |||
void drain_events(void *completions_ptr); | |||
void run_completions(void *completions_ptr); | |||
public: | |||
ring_loop_t(int qd); | |||
~ring_loop_t(); | |||
int register_consumer(ring_consumer_t & consumer); | |||
void unregister_consumer(ring_consumer_t & consumer); | |||
int register_fd(int fd); | |||
void unregister_fd(int fd_index); | |||
inline struct io_uring_sqe* get_sqe() | |||
{ | |||
if (free_ring_data_ptr == 0) | |||
{ | |||
return NULL; | |||
} | |||
struct io_uring_sqe* sqe = io_uring_get_sqe(&ring); | |||
if (sqe) | |||
io_uring_sqe_set_data(sqe, ring_datas + free_ring_data[--free_ring_data_ptr]); | |||
{ | |||
ring_data_t *data = ring_datas + free_ring_data[--free_ring_data_ptr]; | |||
io_uring_sqe_set_data(sqe, data); | |||
} | |||
return sqe; | |||
} | |||
inline int submit() | |||
@@ -21,6 +21,7 @@ timerfd_interval::timerfd_interval(ring_loop_t *ringloop, int seconds, std::func | |||
} | |||
consumer.loop = [this]() { loop(); }; | |||
ringloop->register_consumer(consumer); | |||
timerfd_index = ringloop->register_fd(timerfd); | |||
this->ringloop = ringloop; | |||
this->callback = cb; | |||
} | |||
@@ -44,10 +45,12 @@ void timerfd_interval::loop() | |||
return; | |||
} | |||
struct ring_data_t *data = ((ring_data_t*)sqe->user_data); | |||
my_uring_prep_poll_add(sqe, timerfd, POLLIN); | |||
my_uring_prep_poll_add(sqe, timerfd_index, POLLIN); | |||
sqe->flags |= IOSQE_FIXED_FILE; | |||
data->allow_cancel = true; | |||
data->callback = [&](ring_data_t *data) | |||
{ | |||
if (data->res < 0) | |||
if (data->res < 0 && data->res != -ECANCELED) | |||
{ | |||
throw std::runtime_error(std::string("waiting for timer failed: ") + strerror(-data->res)); | |||
} | |||
@@ -5,7 +5,7 @@ | |||
class timerfd_interval | |||
{ | |||
int wait_state; | |||
int timerfd; | |||
int timerfd, timerfd_index; | |||
int status; | |||
ring_loop_t *ringloop; | |||
ring_consumer_t consumer; | |||