Test SQ poll threads. Unstable and in fact slower :(

test-sq-poll
Vitaliy Filippov 2020-03-03 17:22:40 +03:00
parent c9f3654905
commit eabfe4faac
18 changed files with 215 additions and 59 deletions

View File

@ -287,8 +287,9 @@ resume_1:
data->iov = (struct iovec){ it->buf, (size_t)it->len }; data->iov = (struct iovec){ it->buf, (size_t)it->len };
data->callback = simple_callback_w; data->callback = simple_callback_w;
my_uring_prep_writev( my_uring_prep_writev(
sqe, bs->data_fd, &data->iov, 1, bs->data_offset + clean_loc + it->offset sqe, bs->data_fd_index, &data->iov, 1, bs->data_offset + clean_loc + it->offset
); );
sqe->flags |= IOSQE_FIXED_FILE;
wait_count++; wait_count++;
} }
// Sync data before writing metadata // Sync data before writing metadata
@ -320,8 +321,9 @@ resume_1:
data->iov = (struct iovec){ meta_old.buf, bs->meta_block_size }; data->iov = (struct iovec){ meta_old.buf, bs->meta_block_size };
data->callback = simple_callback_w; data->callback = simple_callback_w;
my_uring_prep_writev( my_uring_prep_writev(
sqe, bs->meta_fd, &data->iov, 1, bs->meta_offset + meta_old.sector sqe, bs->meta_fd_index, &data->iov, 1, bs->meta_offset + meta_old.sector
); );
sqe->flags |= IOSQE_FIXED_FILE;
wait_count++; wait_count++;
} }
if (has_delete) if (has_delete)
@ -342,8 +344,9 @@ resume_1:
data->iov = (struct iovec){ meta_new.buf, bs->meta_block_size }; data->iov = (struct iovec){ meta_new.buf, bs->meta_block_size };
data->callback = simple_callback_w; data->callback = simple_callback_w;
my_uring_prep_writev( my_uring_prep_writev(
sqe, bs->meta_fd, &data->iov, 1, bs->meta_offset + meta_new.sector sqe, bs->meta_fd_index, &data->iov, 1, bs->meta_offset + meta_new.sector
); );
sqe->flags |= IOSQE_FIXED_FILE;
wait_count++; wait_count++;
resume_7: resume_7:
if (wait_count > 0) if (wait_count > 0)
@ -405,7 +408,8 @@ resume_1:
((journal_entry_start*)flusher->journal_superblock)->crc32 = je_crc32((journal_entry*)flusher->journal_superblock); ((journal_entry_start*)flusher->journal_superblock)->crc32 = je_crc32((journal_entry*)flusher->journal_superblock);
data->iov = (struct iovec){ flusher->journal_superblock, bs->journal_block_size }; data->iov = (struct iovec){ flusher->journal_superblock, bs->journal_block_size };
data->callback = simple_callback_w; data->callback = simple_callback_w;
my_uring_prep_writev(sqe, bs->journal.fd, &data->iov, 1, bs->journal.offset); my_uring_prep_writev(sqe, bs->journal_fd_index, &data->iov, 1, bs->journal.offset);
sqe->flags |= IOSQE_FIXED_FILE;
wait_count++; wait_count++;
resume_13: resume_13:
if (wait_count > 0) if (wait_count > 0)
@ -485,8 +489,9 @@ bool journal_flusher_co::scan_dirty(int wait_base)
data->iov = (struct iovec){ v.back().buf, (size_t)submit_len }; data->iov = (struct iovec){ v.back().buf, (size_t)submit_len };
data->callback = simple_callback_r; data->callback = simple_callback_r;
my_uring_prep_readv( my_uring_prep_readv(
sqe, bs->journal.fd, &data->iov, 1, bs->journal.offset + submit_offset sqe, bs->journal_fd_index, &data->iov, 1, bs->journal.offset + submit_offset
); );
sqe->flags |= IOSQE_FIXED_FILE;
wait_count++; wait_count++;
} }
} }
@ -568,8 +573,9 @@ bool journal_flusher_co::modify_meta_read(uint64_t meta_loc, flusher_meta_write_
data->callback = simple_callback_r; data->callback = simple_callback_r;
wr.submitted = true; wr.submitted = true;
my_uring_prep_readv( my_uring_prep_readv(
sqe, bs->meta_fd, &data->iov, 1, bs->meta_offset + wr.sector sqe, bs->meta_fd_index, &data->iov, 1, bs->meta_offset + wr.sector
); );
sqe->flags |= IOSQE_FIXED_FILE;
wait_count++; wait_count++;
} }
else else
@ -638,7 +644,8 @@ bool journal_flusher_co::fsync_batch(bool fsync_meta, int wait_base)
await_sqe(0); await_sqe(0);
data->iov = { 0 }; data->iov = { 0 };
data->callback = simple_callback_w; data->callback = simple_callback_w;
my_uring_prep_fsync(sqe, fsync_meta ? bs->meta_fd : bs->data_fd, IORING_FSYNC_DATASYNC); my_uring_prep_fsync(sqe, fsync_meta ? bs->meta_fd_index : bs->data_fd_index, IORING_FSYNC_DATASYNC);
sqe->flags |= IOSQE_FIXED_FILE;
cur_sync->state = 1; cur_sync->state = 1;
wait_count++; wait_count++;
resume_1: resume_1:

View File

@ -216,6 +216,7 @@ class blockstore_impl_t
int data_fd; int data_fd;
uint64_t meta_size, meta_area, meta_len; uint64_t meta_size, meta_area, meta_len;
uint64_t data_size, data_len; uint64_t data_size, data_len;
int meta_fd_index, data_fd_index, journal_fd_index;
void *metadata_buffer = NULL; void *metadata_buffer = NULL;

View File

@ -55,7 +55,8 @@ int blockstore_init_meta::loop()
bs->meta_len - metadata_read > bs->metadata_buf_size ? bs->metadata_buf_size : bs->meta_len - metadata_read, bs->meta_len - metadata_read > bs->metadata_buf_size ? bs->metadata_buf_size : bs->meta_len - metadata_read,
}; };
data->callback = [this](ring_data_t *data) { handle_event(data); }; data->callback = [this](ring_data_t *data) { handle_event(data); };
my_uring_prep_readv(sqe, bs->meta_fd, &data->iov, 1, bs->meta_offset + metadata_read); my_uring_prep_readv(sqe, bs->meta_fd_index, &data->iov, 1, bs->meta_offset + metadata_read);
sqe->flags |= IOSQE_FIXED_FILE;
bs->ringloop->submit(); bs->ringloop->submit();
submitted = (prev == 1 ? 2 : 1); submitted = (prev == 1 ? 2 : 1);
prev = submitted; prev = submitted;
@ -216,7 +217,8 @@ int blockstore_init_journal::loop()
data = ((ring_data_t*)sqe->user_data); data = ((ring_data_t*)sqe->user_data);
data->iov = { submitted_buf, bs->journal.block_size }; data->iov = { submitted_buf, bs->journal.block_size };
data->callback = simple_callback; data->callback = simple_callback;
my_uring_prep_readv(sqe, bs->journal.fd, &data->iov, 1, bs->journal.offset); my_uring_prep_readv(sqe, bs->journal_fd_index, &data->iov, 1, bs->journal.offset);
sqe->flags |= IOSQE_FIXED_FILE;
bs->ringloop->submit(); bs->ringloop->submit();
wait_count = 1; wait_count = 1;
resume_1: resume_1:
@ -254,7 +256,8 @@ resume_1:
GET_SQE(); GET_SQE();
data->iov = (struct iovec){ submitted_buf, 2*bs->journal.block_size }; data->iov = (struct iovec){ submitted_buf, 2*bs->journal.block_size };
data->callback = simple_callback; data->callback = simple_callback;
my_uring_prep_writev(sqe, bs->journal.fd, &data->iov, 1, bs->journal.offset); my_uring_prep_writev(sqe, bs->journal_fd_index, &data->iov, 1, bs->journal.offset);
sqe->flags |= IOSQE_FIXED_FILE;
wait_count++; wait_count++;
bs->ringloop->submit(); bs->ringloop->submit();
resume_6: resume_6:
@ -266,7 +269,8 @@ resume_1:
if (!bs->disable_journal_fsync) if (!bs->disable_journal_fsync)
{ {
GET_SQE(); GET_SQE();
my_uring_prep_fsync(sqe, bs->journal.fd, IORING_FSYNC_DATASYNC); my_uring_prep_fsync(sqe, bs->journal_fd_index, IORING_FSYNC_DATASYNC);
sqe->flags |= IOSQE_FIXED_FILE;
data->iov = { 0 }; data->iov = { 0 };
data->callback = simple_callback; data->callback = simple_callback;
wait_count++; wait_count++;
@ -325,7 +329,8 @@ resume_1:
end - journal_pos < JOURNAL_BUFFER_SIZE ? end - journal_pos : JOURNAL_BUFFER_SIZE, end - journal_pos < JOURNAL_BUFFER_SIZE ? end - journal_pos : JOURNAL_BUFFER_SIZE,
}; };
data->callback = [this](ring_data_t *data1) { handle_event(data1); }; data->callback = [this](ring_data_t *data1) { handle_event(data1); };
my_uring_prep_readv(sqe, bs->journal.fd, &data->iov, 1, bs->journal.offset + journal_pos); my_uring_prep_readv(sqe, bs->journal_fd_index, &data->iov, 1, bs->journal.offset + journal_pos);
sqe->flags |= IOSQE_FIXED_FILE;
bs->ringloop->submit(); bs->ringloop->submit();
} }
while (done.size() > 0) while (done.size() > 0)
@ -340,7 +345,8 @@ resume_1:
GET_SQE(); GET_SQE();
data->iov = { init_write_buf, bs->journal.block_size }; data->iov = { init_write_buf, bs->journal.block_size };
data->callback = simple_callback; data->callback = simple_callback;
my_uring_prep_writev(sqe, bs->journal.fd, &data->iov, 1, bs->journal.offset + init_write_sector); my_uring_prep_writev(sqe, bs->journal_fd_index, &data->iov, 1, bs->journal.offset + init_write_sector);
sqe->flags |= IOSQE_FIXED_FILE;
wait_count++; wait_count++;
bs->ringloop->submit(); bs->ringloop->submit();
resume_7: resume_7:
@ -354,7 +360,8 @@ resume_1:
GET_SQE(); GET_SQE();
data->iov = { 0 }; data->iov = { 0 };
data->callback = simple_callback; data->callback = simple_callback;
my_uring_prep_fsync(sqe, bs->journal.fd, IORING_FSYNC_DATASYNC); my_uring_prep_fsync(sqe, bs->journal_fd_index, IORING_FSYNC_DATASYNC);
sqe->flags |= IOSQE_FIXED_FILE;
wait_count++; wait_count++;
bs->ringloop->submit(); bs->ringloop->submit();
} }

View File

@ -126,8 +126,9 @@ void prepare_journal_sector_write(journal_t & journal, int cur_sector, io_uring_
}; };
data->callback = cb; data->callback = cb;
my_uring_prep_writev( my_uring_prep_writev(
sqe, journal.fd, &data->iov, 1, journal.offset + journal.sector_info[cur_sector].offset sqe, journal.fd_index, &data->iov, 1, journal.offset + journal.sector_info[cur_sector].offset
); );
sqe->flags |= IOSQE_FIXED_FILE;
} }
journal_t::~journal_t() journal_t::~journal_t()

View File

@ -130,7 +130,7 @@ struct journal_sector_info_t
struct journal_t struct journal_t
{ {
int fd; int fd, fd_index;
uint64_t device_size; uint64_t device_size;
bool inmemory = false; bool inmemory = false;
void *buffer = NULL; void *buffer = NULL;

View File

@ -140,6 +140,10 @@ void blockstore_impl_t::parse_config(blockstore_config_t & config)
void blockstore_impl_t::calc_lengths() void blockstore_impl_t::calc_lengths()
{ {
// register fds
data_fd_index = ringloop->register_fd(data_fd);
meta_fd_index = meta_fd == data_fd ? data_fd_index : ringloop->register_fd(meta_fd);
journal.fd_index = journal_fd_index = journal.fd == meta_fd ? meta_fd_index : ringloop->register_fd(journal.fd);
// data // data
data_len = data_size - data_offset; data_len = data_size - data_offset;
if (data_fd == meta_fd && data_offset < meta_offset) if (data_fd == meta_fd && data_offset < meta_offset)

View File

@ -31,10 +31,11 @@ int blockstore_impl_t::fulfill_read_push(blockstore_op_t *op, void *buf, uint64_
PRIV(op)->pending_ops++; PRIV(op)->pending_ops++;
my_uring_prep_readv( my_uring_prep_readv(
sqe, sqe,
IS_JOURNAL(item_state) ? journal.fd : data_fd, IS_JOURNAL(item_state) ? journal_fd_index : data_fd_index,
&data->iov, 1, &data->iov, 1,
(IS_JOURNAL(item_state) ? journal.offset : data_offset) + offset (IS_JOURNAL(item_state) ? journal.offset : data_offset) + offset
); );
sqe->flags |= IOSQE_FIXED_FILE;
data->callback = [this, op](ring_data_t *data) { handle_read_event(data, op); }; data->callback = [this, op](ring_data_t *data) { handle_read_event(data, op); };
return 1; return 1;
} }

View File

@ -78,7 +78,8 @@ int blockstore_impl_t::continue_sync(blockstore_op_t *op)
if (!disable_data_fsync) if (!disable_data_fsync)
{ {
BS_SUBMIT_GET_SQE(sqe, data); BS_SUBMIT_GET_SQE(sqe, data);
my_uring_prep_fsync(sqe, data_fd, IORING_FSYNC_DATASYNC); my_uring_prep_fsync(sqe, data_fd_index, IORING_FSYNC_DATASYNC);
sqe->flags |= IOSQE_FIXED_FILE;
data->iov = { 0 }; data->iov = { 0 };
data->callback = cb; data->callback = cb;
PRIV(op)->min_used_journal_sector = PRIV(op)->max_used_journal_sector = 0; PRIV(op)->min_used_journal_sector = PRIV(op)->max_used_journal_sector = 0;
@ -161,7 +162,8 @@ int blockstore_impl_t::continue_sync(blockstore_op_t *op)
if (!disable_journal_fsync) if (!disable_journal_fsync)
{ {
BS_SUBMIT_GET_SQE(sqe, data); BS_SUBMIT_GET_SQE(sqe, data);
my_uring_prep_fsync(sqe, journal.fd, IORING_FSYNC_DATASYNC); my_uring_prep_fsync(sqe, journal_fd_index, IORING_FSYNC_DATASYNC);
sqe->flags |= IOSQE_FIXED_FILE;
data->iov = { 0 }; data->iov = { 0 };
data->callback = cb; data->callback = cb;
PRIV(op)->pending_ops = 1; PRIV(op)->pending_ops = 1;

View File

@ -122,8 +122,9 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
data->iov.iov_len = op->len + stripe_offset + stripe_end; // to check it in the callback data->iov.iov_len = op->len + stripe_offset + stripe_end; // to check it in the callback
data->callback = [this, op](ring_data_t *data) { handle_write_event(data, op); }; data->callback = [this, op](ring_data_t *data) { handle_write_event(data, op); };
my_uring_prep_writev( my_uring_prep_writev(
sqe, data_fd, PRIV(op)->iov_zerofill, vcnt, data_offset + (loc << block_order) + op->offset - stripe_offset sqe, data_fd_index, PRIV(op)->iov_zerofill, vcnt, data_offset + (loc << block_order) + op->offset - stripe_offset
); );
sqe->flags |= IOSQE_FIXED_FILE;
PRIV(op)->pending_ops = 1; PRIV(op)->pending_ops = 1;
PRIV(op)->min_used_journal_sector = PRIV(op)->max_used_journal_sector = 0; PRIV(op)->min_used_journal_sector = PRIV(op)->max_used_journal_sector = 0;
// Remember big write as unsynced // Remember big write as unsynced
@ -198,8 +199,9 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
data2->iov = (struct iovec){ op->buf, op->len }; data2->iov = (struct iovec){ op->buf, op->len };
data2->callback = cb; data2->callback = cb;
my_uring_prep_writev( my_uring_prep_writev(
sqe2, journal.fd, &data2->iov, 1, journal.offset + journal.next_free sqe2, journal_fd_index, &data2->iov, 1, journal.offset + journal.next_free
); );
sqe2->flags |= IOSQE_FIXED_FILE;
PRIV(op)->pending_ops++; PRIV(op)->pending_ops++;
} }
else else

13
osd.cpp
View File

@ -110,6 +110,7 @@ osd_t::osd_t(blockstore_config_t & config, blockstore_t *bs, ring_loop_t *ringlo
close(listen_fd); close(listen_fd);
throw std::runtime_error(std::string("epoll_create: ") + strerror(errno)); throw std::runtime_error(std::string("epoll_create: ") + strerror(errno));
} }
epoll_fd_index = ringloop->register_fd(epoll_fd);
epoll_event ev; epoll_event ev;
ev.data.fd = listen_fd; ev.data.fd = listen_fd;
@ -186,16 +187,19 @@ void osd_t::handle_epoll_events()
throw std::runtime_error("can't get SQE, will fall out of sync with EPOLLET"); throw std::runtime_error("can't get SQE, will fall out of sync with EPOLLET");
} }
ring_data_t *data = ((ring_data_t*)sqe->user_data); ring_data_t *data = ((ring_data_t*)sqe->user_data);
my_uring_prep_poll_add(sqe, epoll_fd, POLLIN); data->allow_cancel = true;
my_uring_prep_poll_add(sqe, epoll_fd_index, POLLIN);
sqe->flags |= IOSQE_FIXED_FILE;
data->callback = [this](ring_data_t *data) data->callback = [this](ring_data_t *data)
{ {
if (data->res < 0) if (data->res < 0 && data->res != -ECANCELED)
{ {
throw std::runtime_error(std::string("epoll failed: ") + strerror(-data->res)); throw std::runtime_error(std::string("epoll failed: ") + strerror(-data->res));
} }
handle_epoll_events(); handle_epoll_events();
}; };
ringloop->submit(); ringloop->submit();
// FIXME With SQ thread we have no guarantee that epoll request will be submitted right here...
int nfds; int nfds;
epoll_event events[MAX_EPOLL_EVENTS]; epoll_event events[MAX_EPOLL_EVENTS];
restart: restart:
@ -219,12 +223,13 @@ restart:
.peer_addr = addr, .peer_addr = addr,
.peer_port = ntohs(addr.sin_port), .peer_port = ntohs(addr.sin_port),
.peer_fd = peer_fd, .peer_fd = peer_fd,
.peer_fd_index = ringloop->register_fd(peer_fd),
.peer_state = PEER_CONNECTED, .peer_state = PEER_CONNECTED,
}; };
// Add FD to epoll // Add FD to epoll
epoll_event ev; epoll_event ev;
ev.data.fd = peer_fd; ev.data.fd = peer_fd;
ev.events = EPOLLIN | EPOLLET | EPOLLRDHUP; ev.events = EPOLLIN | EPOLLRDHUP | EPOLLET;
if (epoll_ctl(epoll_fd, EPOLL_CTL_ADD, peer_fd, &ev) < 0) if (epoll_ctl(epoll_fd, EPOLL_CTL_ADD, peer_fd, &ev) < 0)
{ {
throw std::runtime_error(std::string("epoll_ctl: ") + strerror(errno)); throw std::runtime_error(std::string("epoll_ctl: ") + strerror(errno));
@ -263,7 +268,7 @@ restart:
} }
} }
} }
if (nfds == MAX_EPOLL_EVENTS) if (nfds > 0)
{ {
goto restart; goto restart;
} }

4
osd.h
View File

@ -122,7 +122,7 @@ struct osd_client_t
{ {
sockaddr_in peer_addr; sockaddr_in peer_addr;
int peer_port; int peer_port;
int peer_fd; int peer_fd, peer_fd_index;
int peer_state; int peer_state;
std::function<void(osd_num_t, int)> connect_callback; std::function<void(osd_num_t, int)> connect_callback;
osd_num_t osd_num = 0; osd_num_t osd_num = 0;
@ -196,7 +196,7 @@ class osd_t
timerfd_interval *tick_tfd; timerfd_interval *tick_tfd;
int wait_state = 0; int wait_state = 0;
int epoll_fd = 0; int epoll_fd = 0, epoll_fd_index = -1;
int listen_fd = 0; int listen_fd = 0;
ring_consumer_t consumer; ring_consumer_t consumer;

View File

@ -116,6 +116,7 @@ void osd_t::handle_connect_result(int peer_fd)
int one = 1; int one = 1;
setsockopt(peer_fd, SOL_TCP, TCP_NODELAY, &one, sizeof(one)); setsockopt(peer_fd, SOL_TCP, TCP_NODELAY, &one, sizeof(one));
// Disable EPOLLOUT on this fd // Disable EPOLLOUT on this fd
cl.peer_fd_index = ringloop->register_fd(peer_fd);
cl.connect_callback = NULL; cl.connect_callback = NULL;
cl.peer_state = PEER_CONNECTED; cl.peer_state = PEER_CONNECTED;
epoll_event ev; epoll_event ev;

View File

@ -32,36 +32,38 @@ void osd_t::read_requests()
cl.read_msg.msg_iov = &cl.read_iov; cl.read_msg.msg_iov = &cl.read_iov;
cl.read_msg.msg_iovlen = 1; cl.read_msg.msg_iovlen = 1;
data->callback = [this, peer_fd](ring_data_t *data) { handle_read(data, peer_fd); }; data->callback = [this, peer_fd](ring_data_t *data) { handle_read(data, peer_fd); };
my_uring_prep_recvmsg(sqe, peer_fd, &cl.read_msg, 0); my_uring_prep_recvmsg(sqe, cl.peer_fd_index, &cl.read_msg, 0);
sqe->flags |= IOSQE_FIXED_FILE;
} }
read_ready_clients.clear(); read_ready_clients.clear();
} }
void osd_t::handle_read(ring_data_t *data, int peer_fd) void osd_t::handle_read(ring_data_t *data, int peer_fd)
{ {
int res = data->res;
auto cl_it = clients.find(peer_fd); auto cl_it = clients.find(peer_fd);
if (cl_it != clients.end()) if (cl_it != clients.end())
{ {
auto & cl = cl_it->second; auto & cl = cl_it->second;
if (data->res == -EAGAIN) if (res == -EAGAIN)
{ {
cl.read_ready--; cl.read_ready--;
if (cl.read_ready > 0) if (cl.read_ready > 0)
read_ready_clients.push_back(peer_fd); read_ready_clients.push_back(peer_fd);
return; return;
} }
else if (data->res < 0) else if (res < 0)
{ {
// this is a client socket, so don't panic. just disconnect it // this is a client socket, so don't panic. just disconnect it
printf("Client %d socket read error: %d (%s). Disconnecting client\n", peer_fd, -data->res, strerror(-data->res)); printf("Client %d socket read error: %d (%s). Disconnecting client\n", peer_fd, -res, strerror(-res));
stop_client(peer_fd); stop_client(peer_fd);
return; return;
} }
read_ready_clients.push_back(peer_fd); read_ready_clients.push_back(peer_fd);
if (data->res > 0) if (res > 0)
{ {
cl.read_remaining -= data->res; cl.read_remaining -= res;
cl.read_buf += data->res; cl.read_buf += res;
if (cl.read_remaining <= 0) if (cl.read_remaining <= 0)
{ {
cl.read_buf = NULL; cl.read_buf = NULL;

View File

@ -49,7 +49,8 @@ bool osd_t::try_send(osd_client_t & cl)
cl.write_msg.msg_iov = cl.write_op->send_list.get_iovec(); cl.write_msg.msg_iov = cl.write_op->send_list.get_iovec();
cl.write_msg.msg_iovlen = cl.write_op->send_list.get_size(); cl.write_msg.msg_iovlen = cl.write_op->send_list.get_size();
data->callback = [this, peer_fd](ring_data_t *data) { handle_send(data, peer_fd); }; data->callback = [this, peer_fd](ring_data_t *data) { handle_send(data, peer_fd); };
my_uring_prep_sendmsg(sqe, peer_fd, &cl.write_msg, 0); my_uring_prep_sendmsg(sqe, cl.peer_fd_index, &cl.write_msg, 0);
sqe->flags |= IOSQE_FIXED_FILE;
return true; return true;
} }
@ -69,32 +70,33 @@ void osd_t::send_replies()
void osd_t::handle_send(ring_data_t *data, int peer_fd) void osd_t::handle_send(ring_data_t *data, int peer_fd)
{ {
int res = data->res;
auto cl_it = clients.find(peer_fd); auto cl_it = clients.find(peer_fd);
if (cl_it != clients.end()) if (cl_it != clients.end())
{ {
auto & cl = cl_it->second; auto & cl = cl_it->second;
if (data->res < 0 && data->res != -EAGAIN) if (res < 0 && res != -EAGAIN)
{ {
// this is a client socket, so don't panic. just disconnect it // this is a client socket, so don't panic. just disconnect it
printf("Client %d socket write error: %d (%s). Disconnecting client\n", peer_fd, -data->res, strerror(-data->res)); printf("Client %d socket write error: %d (%s). Disconnecting client\n", peer_fd, -res, strerror(-res));
stop_client(peer_fd); stop_client(peer_fd);
return; return;
} }
if (data->res >= 0) if (res >= 0)
{ {
osd_op_t *cur_op = cl.write_op; osd_op_t *cur_op = cl.write_op;
while (data->res > 0 && cur_op->send_list.sent < cur_op->send_list.count) while (res > 0 && cur_op->send_list.sent < cur_op->send_list.count)
{ {
iovec & iov = cur_op->send_list.buf[cur_op->send_list.sent]; iovec & iov = cur_op->send_list.buf[cur_op->send_list.sent];
if (iov.iov_len <= data->res) if (iov.iov_len <= res)
{ {
data->res -= iov.iov_len; res -= iov.iov_len;
cur_op->send_list.sent++; cur_op->send_list.sent++;
} }
else else
{ {
iov.iov_len -= data->res; iov.iov_len -= res;
iov.iov_base += data->res; iov.iov_base += res;
break; break;
} }
} }

View File

@ -1,14 +1,19 @@
#include <set>
#include "ringloop.h" #include "ringloop.h"
ring_loop_t::ring_loop_t(int qd) ring_loop_t::ring_loop_t(int qd)
{ {
int ret = io_uring_queue_init(qd, &ring, 0); io_uring_params params = { 0 };
params.flags = IORING_SETUP_SQPOLL;
params.sq_thread_idle = 10;
int ret = io_uring_queue_init_params(qd, &ring, &params);
if (ret < 0) if (ret < 0)
{ {
throw std::runtime_error(std::string("io_uring_queue_init: ") + strerror(-ret)); throw std::runtime_error(std::string("io_uring_queue_init: ") + strerror(-ret));
} }
free_ring_data_ptr = *ring.cq.kring_entries; ring_data_total = free_ring_data_ptr = *ring.cq.kring_entries;
ring_datas = (struct ring_data_t*)malloc(sizeof(ring_data_t) * free_ring_data_ptr); ring_datas = (ring_data_t*)malloc(sizeof(ring_data_t) * free_ring_data_ptr);
free_ring_data = (int*)malloc(sizeof(int) * free_ring_data_ptr); free_ring_data = (int*)malloc(sizeof(int) * free_ring_data_ptr);
if (!ring_datas || !free_ring_data) if (!ring_datas || !free_ring_data)
{ {
@ -16,6 +21,7 @@ ring_loop_t::ring_loop_t(int qd)
} }
for (int i = 0; i < free_ring_data_ptr; i++) for (int i = 0; i < free_ring_data_ptr; i++)
{ {
ring_datas[i] = { 0 };
free_ring_data[i] = i; free_ring_data[i] = i;
} }
} }
@ -27,6 +33,105 @@ ring_loop_t::~ring_loop_t()
io_uring_queue_exit(&ring); io_uring_queue_exit(&ring);
} }
void ring_loop_t::drain_events(void *completions_ptr)
{
std::set<ring_data_t*> & completions = *((std::set<ring_data_t*> *)completions_ptr);
if (free_ring_data_ptr < ring_data_total)
{
// Try to cancel requests that are allowed to be canceled by the caller (epoll, timerfd and similar)
for (int i = 0; i < ring_data_total; i++)
{
if (ring_datas[i].allow_cancel)
{
// allow_cancel may only be true while the operation is inflight
io_uring_sqe *sqe = get_sqe();
if (!sqe)
{
throw std::runtime_error("can't get SQE to cancel operation");
}
ring_data_t *data = (ring_data_t*)sqe->user_data;
data->callback = NULL;
ring_datas[i].res = -ECANCELED;
my_uring_prep_cancel(sqe, &ring_datas[i], 0);
// It seems (FIXME) cancel operations don't always get completions
completions.insert(data);
}
}
if (completions.size() > 0)
{
submit();
}
}
int inflight = ring_data_total - free_ring_data_ptr;
while (completions.size() < inflight)
{
io_uring_cqe *cqe;
while (!io_uring_peek_cqe(&ring, &cqe))
{
ring_data_t *d = (ring_data_t*)cqe->user_data;
d->res = cqe->res;
d->allow_cancel = false;
completions.insert(d);
io_uring_cqe_seen(&ring, cqe);
}
if (completions.size() < inflight)
{
wait();
}
}
}
void ring_loop_t::run_completions(void *completions_ptr)
{
std::set<ring_data_t*> & completions = *((std::set<ring_data_t*> *)completions_ptr);
// Call event callbacks
for (ring_data_t *d: completions)
{
free_ring_data[free_ring_data_ptr++] = d - ring_datas;
if (d->callback)
d->callback(d);
}
}
int ring_loop_t::register_fd(int fd)
{
std::set<ring_data_t*> completions;
drain_events((void*)&completions);
// Modify registered files
int idx = reg_fds.size();
reg_fds.push_back(fd);
if (registered)
{
io_uring_unregister_files(&ring);
}
int ret = io_uring_register_files(&ring, reg_fds.data(), reg_fds.size());
if (ret != 0)
{
throw std::runtime_error(std::string("io_uring_register_files_update: ") + strerror(-ret));
}
registered = 1;
run_completions((void*)&completions);
return idx;
}
void ring_loop_t::unregister_fd(int fd_index)
{
std::set<ring_data_t*> completions;
drain_events((void*)&completions);
// Modify registered files
reg_fds.erase(reg_fds.begin()+fd_index, reg_fds.begin()+fd_index+1);
if (registered)
{
io_uring_unregister_files(&ring);
}
int ret = io_uring_register_files(&ring, reg_fds.data(), reg_fds.size());
if (ret != 0)
{
throw std::runtime_error(std::string("io_uring_register_files_update: ") + strerror(-ret));
}
run_completions((void*)&completions);
}
int ring_loop_t::register_consumer(ring_consumer_t & consumer) int ring_loop_t::register_consumer(ring_consumer_t & consumer)
{ {
consumer.number = consumers.size(); consumer.number = consumers.size();
@ -50,17 +155,16 @@ void ring_loop_t::unregister_consumer(ring_consumer_t & consumer)
void ring_loop_t::loop() void ring_loop_t::loop()
{ {
struct io_uring_cqe *cqe; io_uring_cqe *cqe;
while (!io_uring_peek_cqe(&ring, &cqe)) while (!io_uring_peek_cqe(&ring, &cqe))
{ {
struct ring_data_t *d = (struct ring_data_t*)cqe->user_data; ring_data_t *d = (ring_data_t*)cqe->user_data;
if (d->callback) d->res = cqe->res;
{ d->allow_cancel = false;
d->res = cqe->res;
d->callback(d);
}
free_ring_data[free_ring_data_ptr++] = d - ring_datas;
io_uring_cqe_seen(&ring, cqe); io_uring_cqe_seen(&ring, cqe);
free_ring_data[free_ring_data_ptr++] = d - ring_datas;
if (d->callback)
d->callback(d);
} }
do do
{ {

View File

@ -107,6 +107,7 @@ static inline void my_uring_prep_cancel(struct io_uring_sqe *sqe, void *user_dat
struct ring_data_t struct ring_data_t
{ {
struct iovec iov; // for single-entry read/write operations struct iovec iov; // for single-entry read/write operations
bool allow_cancel;
int res; int res;
std::function<void(ring_data_t*)> callback; std::function<void(ring_data_t*)> callback;
}; };
@ -122,22 +123,35 @@ class ring_loop_t
std::vector<ring_consumer_t> consumers; std::vector<ring_consumer_t> consumers;
struct ring_data_t *ring_datas; struct ring_data_t *ring_datas;
int *free_ring_data; int *free_ring_data;
unsigned free_ring_data_ptr; unsigned free_ring_data_ptr, ring_data_total;
bool loop_again; bool loop_again;
struct io_uring ring; struct io_uring ring;
int registered = 0;
std::vector<int> reg_fds;
void drain_events(void *completions_ptr);
void run_completions(void *completions_ptr);
public: public:
ring_loop_t(int qd); ring_loop_t(int qd);
~ring_loop_t(); ~ring_loop_t();
int register_consumer(ring_consumer_t & consumer); int register_consumer(ring_consumer_t & consumer);
void unregister_consumer(ring_consumer_t & consumer); void unregister_consumer(ring_consumer_t & consumer);
int register_fd(int fd);
void unregister_fd(int fd_index);
inline struct io_uring_sqe* get_sqe() inline struct io_uring_sqe* get_sqe()
{ {
if (free_ring_data_ptr == 0) if (free_ring_data_ptr == 0)
{
return NULL; return NULL;
}
struct io_uring_sqe* sqe = io_uring_get_sqe(&ring); struct io_uring_sqe* sqe = io_uring_get_sqe(&ring);
if (sqe) if (sqe)
io_uring_sqe_set_data(sqe, ring_datas + free_ring_data[--free_ring_data_ptr]); {
ring_data_t *data = ring_datas + free_ring_data[--free_ring_data_ptr];
io_uring_sqe_set_data(sqe, data);
}
return sqe; return sqe;
} }
inline int submit() inline int submit()

View File

@ -21,6 +21,7 @@ timerfd_interval::timerfd_interval(ring_loop_t *ringloop, int seconds, std::func
} }
consumer.loop = [this]() { loop(); }; consumer.loop = [this]() { loop(); };
ringloop->register_consumer(consumer); ringloop->register_consumer(consumer);
timerfd_index = ringloop->register_fd(timerfd);
this->ringloop = ringloop; this->ringloop = ringloop;
this->callback = cb; this->callback = cb;
} }
@ -44,10 +45,12 @@ void timerfd_interval::loop()
return; return;
} }
struct ring_data_t *data = ((ring_data_t*)sqe->user_data); struct ring_data_t *data = ((ring_data_t*)sqe->user_data);
my_uring_prep_poll_add(sqe, timerfd, POLLIN); my_uring_prep_poll_add(sqe, timerfd_index, POLLIN);
sqe->flags |= IOSQE_FIXED_FILE;
data->allow_cancel = true;
data->callback = [&](ring_data_t *data) data->callback = [&](ring_data_t *data)
{ {
if (data->res < 0) if (data->res < 0 && data->res != -ECANCELED)
{ {
throw std::runtime_error(std::string("waiting for timer failed: ") + strerror(-data->res)); throw std::runtime_error(std::string("waiting for timer failed: ") + strerror(-data->res));
} }

View File

@ -5,7 +5,7 @@
class timerfd_interval class timerfd_interval
{ {
int wait_state; int wait_state;
int timerfd; int timerfd, timerfd_index;
int status; int status;
ring_loop_t *ringloop; ring_loop_t *ringloop;
ring_consumer_t consumer; ring_consumer_t consumer;