From 9e6cbc6ebc72673a6261e31d4b04679f9b46a769 Mon Sep 17 00:00:00 2001
From: Vitaliy Filippov
Date: Thu, 29 Apr 2021 00:12:06 +0300
Subject: [PATCH] Negotiate max_sge between RDMA client & server

The client now sends its rdma_max_sge in the show_config request next to
connect_rdma, and the server replies with its own rdma_max_sge. Each side
uses the smaller of the two values for the connection:

- messenger init clamps rdma_max_sge to the max_sge of the RDMA device;
- the server creates its RDMA connection with min(client, server) max_sge;
- the client lowers rdma_conn->max_sge to the server value if it is smaller.

A zero rdma_max_sge (for example from a peer that does not send the field)
is treated as "not reported" and leaves the local limit unchanged, so the
scatter/gather arrays in try_send_rdma/try_recv_rdma never get zero length.

Also expose the rdma_device, rdma_port_num, rdma_gid_index and rdma_mtu
options in the fio cluster engine and fix the use_rdma option label.
---
 src/fio_cluster.cpp   | 46 ++++++++++++++++++++++++++++++++++++++++++-
 src/messenger.cpp     | 24 +++++++++++++---------
 src/messenger.h       |  5 +++--
 src/msgr_rdma.cpp     | 10 ++++++----
 src/osd_secondary.cpp |  4 ++--
 5 files changed, 71 insertions(+), 18 deletions(-)

diff --git a/src/fio_cluster.cpp b/src/fio_cluster.cpp
index c6bc05c8..68855bf4 100644
--- a/src/fio_cluster.cpp
+++ b/src/fio_cluster.cpp
@@ -54,6 +54,7 @@ struct sec_options
     int cluster_log = 0;
     int trace = 0;
     int use_rdma = 0;
+    char *rdma_device = NULL;
     int rdma_port_num = 0;
     int rdma_gid_index = 0;
     int rdma_mtu = 0;
@@ -127,7 +128,7 @@ static struct fio_option options[] = {
     },
     {
         .name = "use_rdma",
-        .lname = "OSD trace",
+        .lname = "Use RDMA",
         .type = FIO_OPT_BOOL,
         .off1 = offsetof(struct sec_options, use_rdma),
         .help = "Use RDMA",
@@ -135,6 +136,45 @@ static struct fio_option options[] = {
         .category = FIO_OPT_C_ENGINE,
         .group = FIO_OPT_G_FILENAME,
     },
+    {
+        .name = "rdma_device",
+        .lname = "RDMA device name",
+        .type = FIO_OPT_STR_STORE,
+        .off1 = offsetof(struct sec_options, rdma_device),
+        .help = "RDMA device name",
+        .category = FIO_OPT_C_ENGINE,
+        .group = FIO_OPT_G_FILENAME,
+    },
+    {
+        .name = "rdma_port_num",
+        .lname = "RDMA port number",
+        .type = FIO_OPT_INT,
+        .off1 = offsetof(struct sec_options, rdma_port_num),
+        .help = "RDMA port number",
+        .def = "0",
+        .category = FIO_OPT_C_ENGINE,
+        .group = FIO_OPT_G_FILENAME,
+    },
+    {
+        .name = "rdma_gid_index",
+        .lname = "RDMA gid index",
+        .type = FIO_OPT_INT,
+        .off1 = offsetof(struct sec_options, rdma_gid_index),
+        .help = "RDMA gid index",
+        .def = "0",
+        .category = FIO_OPT_C_ENGINE,
+        .group = FIO_OPT_G_FILENAME,
+    },
+    {
+        .name = "rdma_mtu",
+        .lname = "RDMA path MTU",
+        .type = FIO_OPT_INT,
+        .off1 = offsetof(struct sec_options, rdma_mtu),
+        .help = "RDMA path MTU",
+        .def = "0",
+        .category = FIO_OPT_C_ENGINE,
+        .group = FIO_OPT_G_FILENAME,
+    },
     {
         .name = NULL,
     },
@@ -171,6 +211,10 @@ static int sec_setup(struct thread_data *td)
         { "etcd_prefix", std::string(o->etcd_prefix ? o->etcd_prefix : "/vitastor") },
         { "log_level", o->cluster_log },
         { "use_rdma", o->use_rdma },
+        { "rdma_device", std::string(o->rdma_device ? o->rdma_device : "") },
+        { "rdma_port_num", o->rdma_port_num },
+        { "rdma_gid_index", o->rdma_gid_index },
+        { "rdma_mtu", o->rdma_mtu },
     };
 
     if (!o->image)
diff --git a/src/messenger.cpp b/src/messenger.cpp
index d76c70eb..c0003ab3 100644
--- a/src/messenger.cpp
+++ b/src/messenger.cpp
@@ -25,6 +25,8 @@ void osd_messenger_t::init()
         }
         else
         {
+            rdma_max_sge = rdma_max_sge < rdma_context->attrx.orig_attr.max_sge
+                ? rdma_max_sge : rdma_context->attrx.orig_attr.max_sge;
             printf("[OSD %lu] RDMA initialized successfully\n", osd_num);
             fcntl(rdma_context->channel->fd, F_SETFL, fcntl(rdma_context->channel->fd, F_GETFL, 0) | O_NONBLOCK);
             tfd->set_fd_handler(rdma_context->channel->fd, false, [this](int notify_fd, int epoll_events)
@@ -356,9 +358,6 @@ void osd_messenger_t::on_connect_peer(osd_num_t peer_osd, int peer_fd)
 
 void osd_messenger_t::check_peer_config(osd_client_t *cl)
 {
-#ifdef WITH_RDMA
-    msgr_rdma_connection_t *rdma_conn = NULL;
-#endif
     osd_op_t *op = new osd_op_t();
     op->op_type = OSD_OP_OUT;
     op->peer_fd = cl->peer_fd;
@@ -374,11 +373,12 @@ void osd_messenger_t::check_peer_config(osd_client_t *cl)
 #ifdef WITH_RDMA
     if (rdma_context)
     {
-        cl->rdma_conn = msgr_rdma_connection_t::create(rdma_context, max_rdma_send, max_rdma_recv, max_rdma_sge);
+        cl->rdma_conn = msgr_rdma_connection_t::create(rdma_context, rdma_max_send, rdma_max_recv, rdma_max_sge);
         if (cl->rdma_conn)
         {
             json11::Json payload = json11::Json::object {
                 { "connect_rdma", cl->rdma_conn->addr.to_string() },
+                { "rdma_max_sge", rdma_max_sge },
             };
             std::string payload_str = payload.dump();
             op->req.show_conf.json_len = payload_str.size();
@@ -388,11 +388,7 @@ void osd_messenger_t::check_peer_config(osd_client_t *cl)
         }
     }
 #endif
-    op->callback = [this, cl
-#ifdef WITH_RDMA
-        , rdma_conn
-#endif
-    ](osd_op_t *op)
+    op->callback = [this, cl](osd_op_t *op)
     {
         std::string json_err;
         json11::Json config;
@@ -455,6 +451,11 @@ void osd_messenger_t::check_peer_config(osd_client_t *cl)
                 }
                 else
                 {
+                    uint64_t server_max_sge = config["rdma_max_sge"].uint64_value();
+                    if (server_max_sge && cl->rdma_conn->max_sge > server_max_sge) // 0 = limit not reported by the server
+                    {
+                        cl->rdma_conn->max_sge = server_max_sge;
+                    }
                     printf("Connected to OSD %lu using RDMA\n", cl->osd_num);
                     cl->peer_state = PEER_RDMA;
                     tfd->set_fd_handler(cl->peer_fd, false, NULL);
@@ -509,3 +510,8 @@ bool osd_messenger_t::is_rdma_enabled()
 {
     return rdma_context != NULL;
 }
+
+uint64_t osd_messenger_t::get_rdma_max_sge()
+{
+    return rdma_max_sge;
+}
diff --git a/src/messenger.h b/src/messenger.h
index 6b312159..4c485a08 100644
--- a/src/messenger.h
+++ b/src/messenger.h
@@ -137,7 +137,7 @@ protected:
     std::string rdma_device;
     uint64_t rdma_port_num = 1, rdma_gid_index = 0, rdma_mtu = 0;
     msgr_rdma_context_t *rdma_context = NULL;
-    int max_rdma_sge = 128, max_rdma_send = 32, max_rdma_recv = 32;
+    uint64_t rdma_max_sge = 128, rdma_max_send = 32, rdma_max_recv = 32;
 #endif
 
     std::vector<int> read_ready_clients;
@@ -170,7 +170,8 @@ public:
 
 #ifdef WITH_RDMA
     bool is_rdma_enabled();
-    bool connect_rdma(int peer_fd, std::string rdma_address);
+    bool connect_rdma(int peer_fd, std::string rdma_address, uint64_t client_max_sge);
+    uint64_t get_rdma_max_sge();
 #endif
 
 protected:
diff --git a/src/msgr_rdma.cpp b/src/msgr_rdma.cpp
index bcc48ffb..e9184374 100644
--- a/src/msgr_rdma.cpp
+++ b/src/msgr_rdma.cpp
@@ -293,13 +293,17 @@ int msgr_rdma_connection_t::connect(msgr_rdma_address_t *dest)
     return 0;
 }
 
-bool osd_messenger_t::connect_rdma(int peer_fd, std::string rdma_address)
+bool osd_messenger_t::connect_rdma(int peer_fd, std::string rdma_address, uint64_t client_max_sge)
 {
     // Try to connect to the peer using RDMA
     msgr_rdma_address_t addr;
     if (msgr_rdma_address_t::from_string(rdma_address.c_str(), &addr))
     {
-        auto rdma_conn = msgr_rdma_connection_t::create(rdma_context, max_rdma_send, max_rdma_recv, max_rdma_sge);
+        if (!client_max_sge || client_max_sge > rdma_max_sge) // 0 = limit not reported by the client
+        {
+            client_max_sge = rdma_max_sge;
+        }
+        auto rdma_conn = msgr_rdma_connection_t::create(rdma_context, rdma_max_send, rdma_max_recv, client_max_sge);
         if (rdma_conn)
         {
             int r = rdma_conn->connect(&addr);
@@ -352,7 +356,6 @@ bool osd_messenger_t::try_send_rdma(osd_client_t *cl)
         return true;
     }
     int op_size = 0, op_sge = 0, op_max = rc->max_sge*bs_bitmap_granularity;
-    // FIXME: rc->max_sge should be negotiated between client & server
     ibv_sge sge[rc->max_sge];
     while (rc->send_pos < cl->send_list.size())
     {
@@ -448,7 +451,6 @@ bool osd_messenger_t::try_recv_rdma(osd_client_t *cl)
     }
     int op_size = 0, op_sge = 0, op_max = rc->max_sge*bs_bitmap_granularity;
     iovec *segments = cl->recv_list.get_iovec();
-    // FIXME: rc->max_sge should be negotiated between client & server
     ibv_sge sge[rc->max_sge];
     while (rc->recv_pos < cl->recv_list.get_size())
     {
diff --git a/src/osd_secondary.cpp b/src/osd_secondary.cpp
index 106b61de..a674ac9d 100644
--- a/src/osd_secondary.cpp
+++ b/src/osd_secondary.cpp
@@ -169,11 +169,11 @@ void osd_t::exec_show_config(osd_op_t *cur_op)
         if (req_json["connect_rdma"].is_string())
         {
             // Peer is trying to connect using RDMA, try to satisfy him
-            bool ok = msgr.connect_rdma(cur_op->peer_fd, req_json["connect_rdma"].string_value());
+            bool ok = msgr.connect_rdma(cur_op->peer_fd, req_json["connect_rdma"].string_value(), req_json["rdma_max_sge"].uint64_value());
             if (ok)
             {
-                wire_config["rdma_connected"] = true;
                 wire_config["rdma_address"] = msgr.clients.at(cur_op->peer_fd)->rdma_conn->addr.to_string();
+                wire_config["rdma_max_sge"] = msgr.get_rdma_max_sge();
             }
         }
     }