Add up_wait_retry_interval to config and fix it so it actually works

Vitaliy Filippov 2020-09-05 22:05:21 +03:00
parent 44973e7f27
commit 73e26dbbea
3 changed files with 41 additions and 17 deletions

View File

@ -101,16 +101,22 @@ void cluster_client_t::stop()
}
}
// Walks cur_ops and resumes operations via continue_rw(). Operations whose
// up_wait flag is set are resumed only when up_retry is true (the flag is
// cleared first); otherwise they are left untouched on this pass.
// NOTE(review): this text is a flattened diff hunk with the +/- markers lost.
// The parameterless signature directly below looks like the removed pre-change
// line and the one after it like its replacement -- confirm against the repository.
void cluster_client_t::continue_ops()
void cluster_client_t::continue_ops(bool up_retry)
{
// Clear the stored retry timer (if one is set) and reset its id before walking the ops.
if (retry_timeout_id)
{
tfd->clear_timer(retry_timeout_id);
retry_timeout_id = 0;
}
for (auto op_it = cur_ops.begin(); op_it != cur_ops.end(); )
{
// NOTE(review): presumably the removed pre-change loop body (unconditional resume);
// the if/else that follows appears to be what replaced it -- confirm.
continue_rw(*op_it++);
if ((*op_it)->up_wait)
{
if (up_retry)
{
// Retry pass: clear the flag, then resume the op. The iterator is post-incremented
// inside the argument expression, so it has already advanced when continue_rw() runs.
(*op_it)->up_wait = false;
continue_rw(*op_it++);
}
else
// Not a retry pass: step over the waiting op without resuming it.
op_it++;
}
else
continue_rw(*op_it++);
}
}
@ -173,6 +179,15 @@ void cluster_client_t::on_load_config_hook(json11::Json::object & config)
{
client_dirty_limit = DEFAULT_CLIENT_DIRTY_LIMIT;
}
up_wait_retry_interval = config["up_wait_retry_interval"].uint64_value();
if (!up_wait_retry_interval)
{
up_wait_retry_interval = 500;
}
else if (up_wait_retry_interval < 50)
{
up_wait_retry_interval = 50;
}
msgr.peer_connect_interval = config["peer_connect_interval"].uint64_value();
if (!msgr.peer_connect_interval)
{
@ -696,9 +711,17 @@ void cluster_client_t::handle_op_part(cluster_op_part_t *part)
part->osd_num, part->op.reply.hdr.retval, expected
);
msgr.stop_client(part->op.peer_fd);
if (part->op.reply.hdr.retval && !retry_timeout_id)
if (part->op.reply.hdr.retval == -EPIPE)
{
retry_timeout_id = tfd->set_timer(up_wait_retry_interval, false, [this](int) { retry_timeout_id = 0; continue_ops(); });
op->up_wait = true;
if (!retry_timeout_id)
{
retry_timeout_id = tfd->set_timer(up_wait_retry_interval, false, [this](int)
{
retry_timeout_id = 0;
continue_ops(true);
});
}
}
if (!op->retval || op->retval == -EPIPE)
{

View File

@ -40,6 +40,7 @@ protected:
cluster_op_t *orig_op = NULL;
bool is_internal = false;
bool needs_reslice = false;
bool up_wait = false;
int sent_count = 0, done_count = 0;
std::vector<cluster_op_part_t> parts;
friend class cluster_client_t;
@ -59,7 +60,6 @@ class cluster_client_t
// FIXME: Implement inmemory_commit mode. Note that it requires to return overlapping reads from memory.
uint64_t client_dirty_limit = 0;
int log_level;
// FIXME: Put up_wait_retry_interval into config and fix it so it could actually work
int up_wait_retry_interval = 500; // ms
uint64_t op_id = 1;
@ -85,7 +85,7 @@ public:
void stop();
protected:
void continue_ops();
void continue_ops(bool up_retry = false);
void on_load_config_hook(json11::Json::object & config);
void on_load_pgs_hook(bool success);
void on_change_hook(json11::Json::object & changes);

View File

@ -30,11 +30,11 @@ class Mon
/* global: {
// mon
etcd_mon_ttl: 30, // min: 10
etcd_mon_timeout: 1000, // min: 0
etcd_mon_timeout: 1000, // ms. min: 0
etcd_mon_retries: 5, // min: 0
mon_change_timeout: 1000, // min: 100
mon_stats_timeout: 1000, // min: 100
osd_out_time: 1800, // min: 0
mon_change_timeout: 1000, // ms. min: 100
mon_stats_timeout: 1000, // ms. min: 100
osd_out_time: 1800, // seconds. min: 0
placement_levels: { datacenter: 1, rack: 2, host: 3, osd: 4, ... },
// client and osd
use_sync_send_recv: false,
@ -45,8 +45,9 @@ class Mon
pg_stripe_size: 4194304,
immediate_commit: false, // 'all' or 'small'
client_dirty_limit: 33554432,
peer_connect_interval: 5,
peer_connect_timeout: 5,
peer_connect_interval: 5, // seconds. min: 1
peer_connect_timeout: 5, // seconds. min: 1
up_wait_retry_interval: 500, // ms. min: 50
// osd
etcd_report_interval: 30, // min: 10
run_primary: true,