Make pg_stripe_size a per-pool config

Vitaliy Filippov 2020-10-01 18:51:49 +03:00
parent ba74eece4a
commit 9f2a948712
10 changed files with 22 additions and 29 deletions

Visa fil

@@ -160,15 +160,6 @@ void cluster_client_t::on_load_config_hook(json11::Json::object & config)
{
throw std::runtime_error("Bad block size");
}
// FIXME: pg_stripe_size may be a per-pool config
if (config.find("pg_stripe_size") != config.end())
{
pg_stripe_size = config["pg_stripe_size"].uint64_value();
}
if (!pg_stripe_size)
{
pg_stripe_size = DEFAULT_PG_STRIPE_SIZE;
}
if (config["immediate_commit"] == "all")
{
// Cluster-wide immediate_commit mode
@@ -473,7 +464,7 @@ void cluster_client_t::slice_rw(cluster_op_t *op)
int i = 0;
for (uint64_t stripe = first_stripe; stripe <= last_stripe; stripe += pg_block_size)
{
pg_num_t pg_num = (op->inode + stripe/pg_stripe_size) % pool_cfg.real_pg_count + 1;
pg_num_t pg_num = (op->inode + stripe/pool_cfg.pg_stripe_size) % pool_cfg.real_pg_count + 1;
uint64_t begin = (op->offset < stripe ? stripe : op->offset);
uint64_t end = (op->offset + op->len) > (stripe + pg_block_size)
? (stripe + pg_block_size) : (op->offset + op->len);

Visa fil

@@ -9,7 +9,6 @@
#define MIN_BLOCK_SIZE 4*1024
#define MAX_BLOCK_SIZE 128*1024*1024
#define DEFAULT_BLOCK_SIZE 128*1024
#define DEFAULT_PG_STRIPE_SIZE 4*1024*1024
#define DEFAULT_DISK_ALIGNMENT 4096
#define DEFAULT_BITMAP_GRANULARITY 4096
#define DEFAULT_CLIENT_DIRTY_LIMIT 32*1024*1024
@@ -54,7 +53,6 @@ class cluster_client_t
timerfd_manager_t *tfd;
ring_loop_t *ringloop;
uint64_t pg_stripe_size = 0;
uint64_t bs_block_size = 0;
uint64_t bs_disk_alignment = 0;
uint64_t bs_bitmap_granularity = 0;

Visa fil

@@ -358,6 +358,11 @@ void etcd_state_client_t::parse_state(const std::string & key, const json11::Jso
parsed_cfg.pg_minsize = pool_item.second["pg_minsize"].uint64_value();
parsed_cfg.pg_count = pool_item.second["pg_count"].uint64_value();
parsed_cfg.failure_domain = pool_item.second["failure_domain"].string_value();
parsed_cfg.pg_stripe_size = pool_item.second["pg_stripe_size"].uint64_value();
if (!parsed_cfg.pg_stripe_size)
{
parsed_cfg.pg_stripe_size = DEFAULT_PG_STRIPE_SIZE;
}
parsed_cfg.max_osd_combinations = pool_item.second["max_osd_combinations"].uint64_value();
if (!parsed_cfg.max_osd_combinations)
{

Visa fil

@@ -16,6 +16,8 @@
#define ETCD_SLOW_TIMEOUT 5000
#define ETCD_QUICK_TIMEOUT 1000
#define DEFAULT_PG_STRIPE_SIZE 4*1024*1024
struct json_kv_t
{
std::string key;
@@ -46,6 +48,7 @@ struct pool_config_t
uint64_t real_pg_count;
std::string failure_domain;
uint64_t max_osd_combinations;
uint64_t pg_stripe_size;
std::map<pg_num_t, pg_config_t> pg_config;
};

Visa fil

@@ -45,7 +45,6 @@ class Mon
block_size: 131072,
disk_alignment: 4096,
bitmap_granularity: 4096,
pg_stripe_size: 4194304,
immediate_commit: false, // 'all' or 'small'
client_dirty_limit: 33554432,
peer_connect_interval: 5, // seconds. min: 1
@@ -101,6 +100,7 @@ class Mon
pg_count: 100,
failure_domain: 'host',
max_osd_combinations: 10000,
pg_stripe_size: 4194304,
// FIXME add device classes/tags
},
...

Visa fil

@@ -83,12 +83,6 @@ void osd_t::parse_config(blockstore_config_t & config)
if (client_queue_depth < 128)
client_queue_depth = 128;
}
if (config.find("pg_stripe_size") != config.end())
{
pg_stripe_size = strtoull(config["pg_stripe_size"].c_str(), NULL, 10);
if (!pg_stripe_size || !bs_block_size || pg_stripe_size < bs_block_size || (pg_stripe_size % bs_block_size) != 0)
pg_stripe_size = DEFAULT_PG_STRIPE_SIZE;
}
recovery_queue_depth = strtoull(config["recovery_queue_depth"].c_str(), NULL, 10);
if (recovery_queue_depth < 1 || recovery_queue_depth > MAX_RECOVERY_QUEUE)
recovery_queue_depth = DEFAULT_RECOVERY_QUEUE;

4
osd.h
Visa fil

@@ -37,7 +37,6 @@
#define DEFAULT_AUTOSYNC_INTERVAL 5
#define MAX_RECOVERY_QUEUE 2048
#define DEFAULT_RECOVERY_QUEUE 4
#define DEFAULT_PG_STRIPE_SIZE 4*1024*1024 // 4 MB by default
//#define OSD_STUB
@@ -110,7 +109,6 @@ class osd_t
int inflight_ops = 0;
blockstore_t *bs;
uint32_t bs_block_size, bs_disk_alignment;
uint64_t pg_stripe_size = DEFAULT_PG_STRIPE_SIZE;
ring_loop_t *ringloop;
timerfd_manager_t *tfd = NULL;
epoll_manager_t *epmgr = NULL;
@@ -201,7 +199,7 @@ class osd_t
void submit_primary_sync_subops(osd_op_t *cur_op);
void submit_primary_stab_subops(osd_op_t *cur_op);
inline pg_num_t map_to_pg(object_id oid)
inline pg_num_t map_to_pg(object_id oid, uint64_t pg_stripe_size)
{
uint64_t pg_count = pg_counts[INODE_POOL(oid.inode)];
if (!pg_count)

Visa fil

@@ -125,10 +125,11 @@ void osd_t::start_pg_peering(pg_t & pg)
cancel_primary_write(p.second);
}
pg.write_queue.clear();
uint64_t pg_stripe_size = st_cli.pool_config[pg.pool_id].pg_stripe_size;
for (auto it = unstable_writes.begin(); it != unstable_writes.end(); )
{
// Forget this PG's unstable writes
if (INODE_POOL(it->first.oid.inode) == pg.pool_id && map_to_pg(it->first.oid) == pg.pg_num)
if (INODE_POOL(it->first.oid.inode) == pg.pool_id && map_to_pg(it->first.oid, pg_stripe_size) == pg.pg_num)
unstable_writes.erase(it++);
else
it++;
@@ -348,7 +349,7 @@ void osd_t::submit_list_subop(osd_num_t role_osd, pg_peering_state_t *ps)
clock_gettime(CLOCK_REALTIME, &op->tv_begin);
op->bs_op = new blockstore_op_t();
op->bs_op->opcode = BS_OP_LIST;
op->bs_op->oid.stripe = pg_stripe_size;
op->bs_op->oid.stripe = st_cli.pool_config[ps->pool_id].pg_stripe_size;
op->bs_op->oid.inode = ((uint64_t)ps->pool_id << (64 - POOL_ID_BITS));
op->bs_op->version = ((uint64_t)(ps->pool_id+1) << (64 - POOL_ID_BITS)) - 1;
op->bs_op->len = pg_counts[ps->pool_id];
@@ -392,7 +393,7 @@ void osd_t::submit_list_subop(osd_num_t role_osd, pg_peering_state_t *ps)
},
.list_pg = ps->pg_num,
.pg_count = pg_counts[ps->pool_id],
.pg_stripe_size = pg_stripe_size,
.pg_stripe_size = st_cli.pool_config[ps->pool_id].pg_stripe_size,
.min_inode = ((uint64_t)(ps->pool_id) << (64 - POOL_ID_BITS)),
.max_inode = ((uint64_t)(ps->pool_id+1) << (64 - POOL_ID_BITS)) - 1,
},

Visa fil

@@ -25,8 +25,7 @@ bool osd_t::prepare_primary_rw(osd_op_t *cur_op)
// oid.stripe = starting offset of the parity stripe
.stripe = (cur_op->req.rw.offset/pg_block_size)*pg_block_size,
};
// FIXME: pg_stripe_size may be a per-pool config
pg_num_t pg_num = (cur_op->req.rw.inode + oid.stripe/pg_stripe_size) % pg_counts[pool_id] + 1;
pg_num_t pg_num = (cur_op->req.rw.inode + oid.stripe/pool_cfg.pg_stripe_size) % pg_counts[pool_id] + 1;
auto pg_it = pgs.find({ .pool_id = pool_id, .pg_num = pg_num });
if (pg_it == pgs.end() || !(pg_it->second.state & PG_ACTIVE))
{
@@ -604,7 +603,10 @@ resume_6:
{
// Except those from peered PGs
auto & w = op_data->unstable_writes[i];
pool_pg_num_t wpg = { .pool_id = INODE_POOL(w.oid.inode), .pg_num = map_to_pg(w.oid) };
pool_pg_num_t wpg = {
.pool_id = INODE_POOL(w.oid.inode),
.pg_num = map_to_pg(w.oid, st_cli.pool_config.at(INODE_POOL(w.oid.inode)).pg_stripe_size),
};
if (pgs[wpg].state & PG_ACTIVE)
{
uint64_t & dest = this->unstable_writes[(osd_object_id_t){

Visa fil

@@ -21,7 +21,8 @@
#define PG_HAS_INVALID (1<<11)
#define PG_LEFT_ON_DEAD (1<<12)
// FIXME: Safe default that doesn't depend on pg_stripe_size or pg_block_size
// Lower bits that represent object role (EC 0/1/2... or always 0 with replication)
// 12 bits is a safe default that doesn't depend on pg_stripe_size or pg_block_size
#define STRIPE_MASK ((uint64_t)4096 - 1)
// OSD object states