# Explanatory notes for this patch. Nothing below the first "diff --git" line
# has been altered apart from restoring one diff line per physical line.
#
# cluster_client.cpp
#   * continue_ops() gains a `bool up_retry` parameter and no longer clears the
#     retry_timeout_id timer itself. Operations whose `up_wait` flag is set are
#     skipped unless up_retry is true; when it is true the flag is cleared and
#     the operation is passed to continue_rw(). Operations without the flag are
#     passed to continue_rw() exactly as before.
#     NOTE(review): the iterator is post-incremented before continue_rw() runs,
#     presumably because continue_rw() may remove the op from cur_ops - confirm.
#   * on_load_config_hook() now reads "up_wait_retry_interval" from the config:
#     a value of 0 becomes 500, any other value below 50 is raised to 50.
#     NOTE(review): the value is read with uint64_value() but the member is
#     declared `int` (see cluster_client.h below), so a very large setting
#     would be narrowed - confirm this is acceptable.
#   * handle_op_part(): in the branch that calls msgr.stop_client(), the retry
#     path is now taken only when the reply retval is exactly -EPIPE
#     (previously: any non-zero retval). It sets op->up_wait = true and, if no
#     retry timer is pending (retry_timeout_id == 0), schedules one after
#     up_wait_retry_interval; the callback resets retry_timeout_id and calls
#     continue_ops(true).
#     NOTE(review): the `false` argument of set_timer() presumably means
#     "do not repeat" - confirm against the timer API.
#
# cluster_client.h
#   * Adds `bool up_wait = false` next to needs_reslice in the operation struct.
#   * Drops the FIXME about making up_wait_retry_interval configurable; the
#     member keeps its default of 500 and its `// ms` unit comment.
#   * Declares continue_ops(bool up_retry = false), so existing calls without
#     an argument still compile.
#   * NOTE(review): the context line `std::vector parts;` looks like it lost
#     its template argument when this patch text was extracted; it is
#     reproduced verbatim here.
#
# lp/mon.js
#   * Comment-only changes inside the documented config block: adds units
#     (ms / seconds) and minimums to several keys and documents the new
#     up_wait_retry_interval key (500 ms, min 50), matching the C++ defaults.
#
diff --git a/cluster_client.cpp b/cluster_client.cpp
index 64465bc4..c8c28cf8 100644
--- a/cluster_client.cpp
+++ b/cluster_client.cpp
@@ -101,16 +101,22 @@ void cluster_client_t::stop()
     }
 }
 
-void cluster_client_t::continue_ops()
+void cluster_client_t::continue_ops(bool up_retry)
 {
-    if (retry_timeout_id)
-    {
-        tfd->clear_timer(retry_timeout_id);
-        retry_timeout_id = 0;
-    }
     for (auto op_it = cur_ops.begin(); op_it != cur_ops.end(); )
     {
-        continue_rw(*op_it++);
+        if ((*op_it)->up_wait)
+        {
+            if (up_retry)
+            {
+                (*op_it)->up_wait = false;
+                continue_rw(*op_it++);
+            }
+            else
+                op_it++;
+        }
+        else
+            continue_rw(*op_it++);
     }
 }
 
@@ -173,6 +179,15 @@ void cluster_client_t::on_load_config_hook(json11::Json::object & config)
     {
         client_dirty_limit = DEFAULT_CLIENT_DIRTY_LIMIT;
     }
+    up_wait_retry_interval = config["up_wait_retry_interval"].uint64_value();
+    if (!up_wait_retry_interval)
+    {
+        up_wait_retry_interval = 500;
+    }
+    else if (up_wait_retry_interval < 50)
+    {
+        up_wait_retry_interval = 50;
+    }
     msgr.peer_connect_interval = config["peer_connect_interval"].uint64_value();
     if (!msgr.peer_connect_interval)
     {
@@ -696,9 +711,17 @@ void cluster_client_t::handle_op_part(cluster_op_part_t *part)
             part->osd_num, part->op.reply.hdr.retval, expected
         );
         msgr.stop_client(part->op.peer_fd);
-        if (part->op.reply.hdr.retval && !retry_timeout_id)
+        if (part->op.reply.hdr.retval == -EPIPE)
         {
-            retry_timeout_id = tfd->set_timer(up_wait_retry_interval, false, [this](int) { retry_timeout_id = 0; continue_ops(); });
+            op->up_wait = true;
+            if (!retry_timeout_id)
+            {
+                retry_timeout_id = tfd->set_timer(up_wait_retry_interval, false, [this](int)
+                {
+                    retry_timeout_id = 0;
+                    continue_ops(true);
+                });
+            }
         }
         if (!op->retval || op->retval == -EPIPE)
         {
diff --git a/cluster_client.h b/cluster_client.h
index 4358b3d6..1ed4335a 100644
--- a/cluster_client.h
+++ b/cluster_client.h
@@ -40,6 +40,7 @@ protected:
     cluster_op_t *orig_op = NULL;
     bool is_internal = false;
     bool needs_reslice = false;
+    bool up_wait = false;
     int sent_count = 0, done_count = 0;
     std::vector parts;
     friend class cluster_client_t;
@@ -59,7 +60,6 @@ class cluster_client_t
     // FIXME: Implement inmemory_commit mode. Note that it requires to return overlapping reads from memory.
     uint64_t client_dirty_limit = 0;
     int log_level;
-    // FIXME: Put up_wait_retry_interval into config and fix it so it could actually work
     int up_wait_retry_interval = 500; // ms
 
     uint64_t op_id = 1;
@@ -85,7 +85,7 @@ public:
     void stop();
 
 protected:
-    void continue_ops();
+    void continue_ops(bool up_retry = false);
     void on_load_config_hook(json11::Json::object & config);
     void on_load_pgs_hook(bool success);
     void on_change_hook(json11::Json::object & changes);
diff --git a/lp/mon.js b/lp/mon.js
index aaff3ade..00cb9b23 100644
--- a/lp/mon.js
+++ b/lp/mon.js
@@ -30,11 +30,11 @@ class Mon
     /* global: {
         // mon
         etcd_mon_ttl: 30, // min: 10
-        etcd_mon_timeout: 1000, // min: 0
+        etcd_mon_timeout: 1000, // ms. min: 0
         etcd_mon_retries: 5, // min: 0
-        mon_change_timeout: 1000, // min: 100
-        mon_stats_timeout: 1000, // min: 100
-        osd_out_time: 1800, // min: 0
+        mon_change_timeout: 1000, // ms. min: 100
+        mon_stats_timeout: 1000, // ms. min: 100
+        osd_out_time: 1800, // seconds. min: 0
         placement_levels: { datacenter: 1, rack: 2, host: 3, osd: 4, ... },
         // client and osd
         use_sync_send_recv: false,
@@ -45,8 +45,9 @@ class Mon
         pg_stripe_size: 4194304,
         immediate_commit: false, // 'all' or 'small'
         client_dirty_limit: 33554432,
-        peer_connect_interval: 5,
-        peer_connect_timeout: 5,
+        peer_connect_interval: 5, // seconds. min: 1
+        peer_connect_timeout: 5, // seconds. min: 1
+        up_wait_retry_interval: 500, // ms. min: 50
         // osd
         etcd_report_interval: 30, // min: 10
         run_primary: true,