From 9f2a94871286037275ed4fda5c2b5dfcda6c5c31 Mon Sep 17 00:00:00 2001 From: Vitaliy Filippov Date: Thu, 1 Oct 2020 18:51:49 +0300 Subject: [PATCH] Make pg_stripe_size a per-pool config --- cluster_client.cpp | 11 +---------- cluster_client.h | 2 -- etcd_state_client.cpp | 5 +++++ etcd_state_client.h | 3 +++ mon/mon.js | 2 +- osd.cpp | 6 ------ osd.h | 4 +--- osd_peering.cpp | 7 ++++--- osd_primary.cpp | 8 +++++--- pg_states.h | 3 ++- 10 files changed, 22 insertions(+), 29 deletions(-) diff --git a/cluster_client.cpp b/cluster_client.cpp index b0a7edb6..90bb86b8 100644 --- a/cluster_client.cpp +++ b/cluster_client.cpp @@ -160,15 +160,6 @@ void cluster_client_t::on_load_config_hook(json11::Json::object & config) { throw std::runtime_error("Bad block size"); } - // FIXME: pg_stripe_size may be a per-pool config - if (config.find("pg_stripe_size") != config.end()) - { - pg_stripe_size = config["pg_stripe_size"].uint64_value(); - } - if (!pg_stripe_size) - { - pg_stripe_size = DEFAULT_PG_STRIPE_SIZE; - } if (config["immediate_commit"] == "all") { // Cluster-wide immediate_commit mode @@ -473,7 +464,7 @@ void cluster_client_t::slice_rw(cluster_op_t *op) int i = 0; for (uint64_t stripe = first_stripe; stripe <= last_stripe; stripe += pg_block_size) { - pg_num_t pg_num = (op->inode + stripe/pg_stripe_size) % pool_cfg.real_pg_count + 1; + pg_num_t pg_num = (op->inode + stripe/pool_cfg.pg_stripe_size) % pool_cfg.real_pg_count + 1; uint64_t begin = (op->offset < stripe ? stripe : op->offset); uint64_t end = (op->offset + op->len) > (stripe + pg_block_size) ? (stripe + pg_block_size) : (op->offset + op->len); diff --git a/cluster_client.h b/cluster_client.h index 7516a723..fb36fefd 100644 --- a/cluster_client.h +++ b/cluster_client.h @@ -9,7 +9,6 @@ #define MIN_BLOCK_SIZE 4*1024 #define MAX_BLOCK_SIZE 128*1024*1024 #define DEFAULT_BLOCK_SIZE 128*1024 -#define DEFAULT_PG_STRIPE_SIZE 4*1024*1024 #define DEFAULT_DISK_ALIGNMENT 4096 #define DEFAULT_BITMAP_GRANULARITY 4096 #define DEFAULT_CLIENT_DIRTY_LIMIT 32*1024*1024 @@ -54,7 +53,6 @@ class cluster_client_t timerfd_manager_t *tfd; ring_loop_t *ringloop; - uint64_t pg_stripe_size = 0; uint64_t bs_block_size = 0; uint64_t bs_disk_alignment = 0; uint64_t bs_bitmap_granularity = 0; diff --git a/etcd_state_client.cpp b/etcd_state_client.cpp index 310f5411..a986ab93 100644 --- a/etcd_state_client.cpp +++ b/etcd_state_client.cpp @@ -358,6 +358,11 @@ void etcd_state_client_t::parse_state(const std::string & key, const json11::Jso parsed_cfg.pg_minsize = pool_item.second["pg_minsize"].uint64_value(); parsed_cfg.pg_count = pool_item.second["pg_count"].uint64_value(); parsed_cfg.failure_domain = pool_item.second["failure_domain"].string_value(); + parsed_cfg.pg_stripe_size = pool_item.second["pg_stripe_size"].uint64_value(); + if (!parsed_cfg.pg_stripe_size) + { + parsed_cfg.pg_stripe_size = DEFAULT_PG_STRIPE_SIZE; + } parsed_cfg.max_osd_combinations = pool_item.second["max_osd_combinations"].uint64_value(); if (!parsed_cfg.max_osd_combinations) { diff --git a/etcd_state_client.h b/etcd_state_client.h index c0f0d37f..2f6a92f9 100644 --- a/etcd_state_client.h +++ b/etcd_state_client.h @@ -16,6 +16,8 @@ #define ETCD_SLOW_TIMEOUT 5000 #define ETCD_QUICK_TIMEOUT 1000 +#define DEFAULT_PG_STRIPE_SIZE 4*1024*1024 + struct json_kv_t { std::string key; @@ -46,6 +48,7 @@ struct pool_config_t uint64_t real_pg_count; std::string failure_domain; uint64_t max_osd_combinations; + uint64_t pg_stripe_size; std::map pg_config; }; diff --git a/mon/mon.js b/mon/mon.js index 2b96af0a..286674ef 100644 --- a/mon/mon.js +++ b/mon/mon.js @@ -45,7 +45,6 @@ class Mon block_size: 131072, disk_alignment: 4096, bitmap_granularity: 4096, - pg_stripe_size: 4194304, immediate_commit: false, // 'all' or 'small' client_dirty_limit: 33554432, peer_connect_interval: 5, // seconds. min: 1 @@ -101,6 +100,7 @@ class Mon pg_count: 100, failure_domain: 'host', max_osd_combinations: 10000, + pg_stripe_size: 4194304, // FIXME add device classes/tags }, ... diff --git a/osd.cpp b/osd.cpp index a72ca354..6619d729 100644 --- a/osd.cpp +++ b/osd.cpp @@ -83,12 +83,6 @@ void osd_t::parse_config(blockstore_config_t & config) if (client_queue_depth < 128) client_queue_depth = 128; } - if (config.find("pg_stripe_size") != config.end()) - { - pg_stripe_size = strtoull(config["pg_stripe_size"].c_str(), NULL, 10); - if (!pg_stripe_size || !bs_block_size || pg_stripe_size < bs_block_size || (pg_stripe_size % bs_block_size) != 0) - pg_stripe_size = DEFAULT_PG_STRIPE_SIZE; - } recovery_queue_depth = strtoull(config["recovery_queue_depth"].c_str(), NULL, 10); if (recovery_queue_depth < 1 || recovery_queue_depth > MAX_RECOVERY_QUEUE) recovery_queue_depth = DEFAULT_RECOVERY_QUEUE; diff --git a/osd.h b/osd.h index 3bb95cb8..f5dc1997 100644 --- a/osd.h +++ b/osd.h @@ -37,7 +37,6 @@ #define DEFAULT_AUTOSYNC_INTERVAL 5 #define MAX_RECOVERY_QUEUE 2048 #define DEFAULT_RECOVERY_QUEUE 4 -#define DEFAULT_PG_STRIPE_SIZE 4*1024*1024 // 4 MB by default //#define OSD_STUB @@ -110,7 +109,6 @@ class osd_t int inflight_ops = 0; blockstore_t *bs; uint32_t bs_block_size, bs_disk_alignment; - uint64_t pg_stripe_size = DEFAULT_PG_STRIPE_SIZE; ring_loop_t *ringloop; timerfd_manager_t *tfd = NULL; epoll_manager_t *epmgr = NULL; @@ -201,7 +199,7 @@ class osd_t void submit_primary_sync_subops(osd_op_t *cur_op); void submit_primary_stab_subops(osd_op_t *cur_op); - inline pg_num_t map_to_pg(object_id oid) + inline pg_num_t map_to_pg(object_id oid, uint64_t pg_stripe_size) { uint64_t pg_count = pg_counts[INODE_POOL(oid.inode)]; if (!pg_count) diff --git a/osd_peering.cpp b/osd_peering.cpp index b063074d..9f911273 100644 --- a/osd_peering.cpp +++ b/osd_peering.cpp @@ -125,10 +125,11 @@ void osd_t::start_pg_peering(pg_t & pg) cancel_primary_write(p.second); } pg.write_queue.clear(); + uint64_t pg_stripe_size = st_cli.pool_config[pg.pool_id].pg_stripe_size; for (auto it = unstable_writes.begin(); it != unstable_writes.end(); ) { // Forget this PG's unstable writes - if (INODE_POOL(it->first.oid.inode) == pg.pool_id && map_to_pg(it->first.oid) == pg.pg_num) + if (INODE_POOL(it->first.oid.inode) == pg.pool_id && map_to_pg(it->first.oid, pg_stripe_size) == pg.pg_num) unstable_writes.erase(it++); else it++; @@ -348,7 +349,7 @@ void osd_t::submit_list_subop(osd_num_t role_osd, pg_peering_state_t *ps) clock_gettime(CLOCK_REALTIME, &op->tv_begin); op->bs_op = new blockstore_op_t(); op->bs_op->opcode = BS_OP_LIST; - op->bs_op->oid.stripe = pg_stripe_size; + op->bs_op->oid.stripe = st_cli.pool_config[ps->pool_id].pg_stripe_size; op->bs_op->oid.inode = ((uint64_t)ps->pool_id << (64 - POOL_ID_BITS)); op->bs_op->version = ((uint64_t)(ps->pool_id+1) << (64 - POOL_ID_BITS)) - 1; op->bs_op->len = pg_counts[ps->pool_id]; @@ -392,7 +393,7 @@ void osd_t::submit_list_subop(osd_num_t role_osd, pg_peering_state_t *ps) }, .list_pg = ps->pg_num, .pg_count = pg_counts[ps->pool_id], - .pg_stripe_size = pg_stripe_size, + .pg_stripe_size = st_cli.pool_config[ps->pool_id].pg_stripe_size, .min_inode = ((uint64_t)(ps->pool_id) << (64 - POOL_ID_BITS)), .max_inode = ((uint64_t)(ps->pool_id+1) << (64 - POOL_ID_BITS)) - 1, }, diff --git a/osd_primary.cpp b/osd_primary.cpp index 84e85349..6d406191 100644 --- a/osd_primary.cpp +++ b/osd_primary.cpp @@ -25,8 +25,7 @@ bool osd_t::prepare_primary_rw(osd_op_t *cur_op) // oid.stripe = starting offset of the parity stripe .stripe = (cur_op->req.rw.offset/pg_block_size)*pg_block_size, }; - // FIXME: pg_stripe_size may be a per-pool config - pg_num_t pg_num = (cur_op->req.rw.inode + oid.stripe/pg_stripe_size) % pg_counts[pool_id] + 1; + pg_num_t pg_num = (cur_op->req.rw.inode + oid.stripe/pool_cfg.pg_stripe_size) % pg_counts[pool_id] + 1; auto pg_it = pgs.find({ .pool_id = pool_id, .pg_num = pg_num }); if (pg_it == pgs.end() || !(pg_it->second.state & PG_ACTIVE)) { @@ -604,7 +603,10 @@ resume_6: { // Except those from peered PGs auto & w = op_data->unstable_writes[i]; - pool_pg_num_t wpg = { .pool_id = INODE_POOL(w.oid.inode), .pg_num = map_to_pg(w.oid) }; + pool_pg_num_t wpg = { + .pool_id = INODE_POOL(w.oid.inode), + .pg_num = map_to_pg(w.oid, st_cli.pool_config.at(INODE_POOL(w.oid.inode)).pg_stripe_size), + }; if (pgs[wpg].state & PG_ACTIVE) { uint64_t & dest = this->unstable_writes[(osd_object_id_t){ diff --git a/pg_states.h b/pg_states.h index dc1d28a7..89d0d7df 100644 --- a/pg_states.h +++ b/pg_states.h @@ -21,7 +21,8 @@ #define PG_HAS_INVALID (1<<11) #define PG_LEFT_ON_DEAD (1<<12) -// FIXME: Safe default that doesn't depend on pg_stripe_size or pg_block_size +// Lower bits that represent object role (EC 0/1/2... or always 0 with replication) +// 12 bits is a safe default that doesn't depend on pg_stripe_size or pg_block_size #define STRIPE_MASK ((uint64_t)4096 - 1) // OSD object states