forked from vitalif/vitastor

Implement OSD tags (device classes), fix pool failure_domain configuration

parent aa2a0ee00f
commit 94efb54feb
README.md
@@ -21,7 +21,7 @@ breaking changes in the future. However, the following is implemented:
 - Automatic data distribution over OSDs, with support for:
   - Mathematical optimization for better uniformity and less data movement
   - Multiple pools
-  - Placement tree
+  - Placement tree, OSD selection by tags (device classes) and placement root
   - Configurable failure domains
 - Recovery of degraded blocks
 - Rebalancing (data movement between OSDs)
@@ -46,6 +46,7 @@ breaking changes in the future. However, the following is implemented:
 - Inode metadata storage in etcd
 - Snapshots and copy-on-write image clones
 - Operation timeouts and better failure detection
+- Scrubbing without checksums (verification of replicas)
 - Checksums
 - SSD+HDD optimizations, possibly including tiered storage and soft journal flushes
 - RDMA and NVDIMM support
@@ -80,7 +81,7 @@ Architectural differences from Ceph:
   per drive you should run multiple OSDs each on a different partition of the drive.
   Vitastor isn't CPU-hungry though (as opposed to Ceph), so 1 core is sufficient in a lot of cases.
 - Metadata and journal are always kept in memory. Metadata size depends linearly on drive capacity
-  and data store block size which is 128 KB by default. With 128 KB blocks, metadata should occupy
+  and data store block size which is 128 KB by default. With 128 KB blocks metadata should occupy
   around 512 MB per 1 TB (which is still less than Ceph wants). Journal doesn't have to be big,
   the example test below was conducted with only 16 MB journal. A big journal is probably even
   harmful as dirty write metadata also take some memory.
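The README's 512 MB per 1 TB estimate is easy to sanity-check from the block count; a quick back-of-the-envelope calculation (assuming roughly 64 bytes of in-memory metadata per block entry, a figure inferred for illustration rather than taken from the source):

    // Rough check of the README estimate above (the 64 bytes/entry figure is assumed):
    const drive_bytes = 1024 ** 4;             // 1 TiB of data
    const block_bytes = 128 * 1024;            // default data store block size
    const entries = drive_bytes / block_bytes; // 8388608 metadata entries
    console.log(entries * 64 / 1024 / 1024);   // 512 (MiB), matching the README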
mon/mon.js (65 changed lines)
@@ -101,13 +101,15 @@ class Mon
             failure_domain: 'host',
             max_osd_combinations: 10000,
             pg_stripe_size: 4194304,
-            // FIXME add device classes/tags
+            root_node?: 'rack1',
+            // restrict pool to OSDs having all of these tags
+            osd_tags?: 'nvme' | [ 'nvme', ... ],
         },
         ...
     }, */
     pools: {},
     osd: {
-        /* <id>: { reweight: 1 }, ... */
+        /* <id>: { reweight?: 1, tags?: [ 'nvme', ... ] }, ... */
     },
     /* pgs: {
         hash: string,
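To make the new schema fields concrete, here is a sketch (all ids, names and values invented for illustration) of a pool definition and a matching OSD entry, as plain objects in the shape the commented schema above describes:

    // Sketch only: values are invented; shapes follow the schema comment above.
    const config = {
        pools: {
            1: {
                pg_size: 2,
                pg_minsize: 1,
                pg_count: 256,
                failure_domain: 'host',
                max_osd_combinations: 10000,
                pg_stripe_size: 4194304,
                root_node: 'rack1',    // place this pool only under the 'rack1' tree node
                osd_tags: [ 'nvme' ],  // and only on OSDs carrying all of these tags
            },
        },
        osd: {
            5: { reweight: 1, tags: [ 'nvme' ] },  // OSD 5 qualifies for the pool above
        },
    };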
@@ -466,7 +468,8 @@ class Mon
         if (stat.size && (this.state.osd.state[osd_num] || Number(stat.time) >= down_time))
         {
             // Numeric IDs are reserved for OSDs
-            let reweight = this.state.config.osd[osd_num] && Number(this.state.config.osd[osd_num].reweight);
+            const osd_cfg = this.state.config.osd[osd_num];
+            let reweight = osd_cfg && Number(osd_cfg.reweight);
             if (reweight < 0 || isNaN(reweight))
                 reweight = 1;
             if (this.state.osd.state[osd_num] && reweight > 0)
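Factoring the lookup into osd_cfg does not change the fallback behaviour of the reweight parsing; for reference, a standalone sketch of the edge cases it covers:

    // Same logic as above, extracted for demonstration:
    const resolve_reweight = (osd_cfg) =>
    {
        let reweight = osd_cfg && Number(osd_cfg.reweight);
        if (reweight < 0 || isNaN(reweight))
            reweight = 1;
        return reweight;
    };
    console.log(resolve_reweight(undefined));          // 1 (no config entry, NaN check fires)
    console.log(resolve_reweight({ reweight: 'x' }));  // 1 (non-numeric)
    console.log(resolve_reweight({ reweight: -2 }));   // 1 (negative)
    console.log(resolve_reweight({ reweight: 0 }));    // 0 (kept; the check below then skips the OSD)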
@@ -477,6 +480,11 @@ class Mon
             tree[osd_num] = tree[osd_num] || { id: osd_num, parent: stat.host };
             tree[osd_num].level = 'osd';
             tree[osd_num].size = reweight * stat.size / 1024 / 1024 / 1024 / 1024; // terabytes
+            if (osd_cfg && osd_cfg.tags)
+            {
+                tree[osd_num].tags = (osd_cfg.tags instanceof Array ? [ ...osd_cfg.tags ] : [ osd_cfg.tags ])
+                    .reduce((a, c) => { a[c] = true; return a; }, {});
+            }
             delete tree[osd_num].children;
         }
     }
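The reduce above normalizes a single tag string or a tag array into a set-like lookup object, so that tag membership later becomes a constant-time property check; in isolation:

    // Standalone copy of the normalization above:
    const to_tag_map = (tags) =>
        (tags instanceof Array ? [ ...tags ] : [ tags ])
            .reduce((a, c) => { a[c] = true; return a; }, {});
    console.log(to_tag_map('nvme'));            // { nvme: true }
    console.log(to_tag_map([ 'nvme', 'ssd' ])); // { nvme: true, ssd: true }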
@@ -496,7 +504,7 @@ class Mon
             tree[parent].children.push(tree[node_id]);
             delete node_cfg.parent;
         }
-        return { up_osds, osd_tree: LPOptimizer.flatten_tree(tree[''].children, levels, this.config.failure_domain, 'osd') };
+        return { up_osds, levels, osd_tree: tree };
     }
 
     async stop_all_pgs(pool_id)
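get_osd_tree() used to flatten the tree itself with the single global failure_domain; returning the raw tree plus the level map instead lets recheck_pgs() flatten per pool, each with its own failure domain, root node and tag filter. Roughly, and with invented node names and values (the exact contents of levels are an assumption here), the returned object looks like:

    // Illustrative shape only; '' is the implicit root of the placement tree.
    // Children arrays hold references to the node objects themselves.
    const example = {
        up_osds: { 1: true, 2: true },
        levels: { /* level name -> rank, as configured */ },
        osd_tree: {
            '':    { children: [ /* host nodes */ ] },
            host1: { id: 'host1', level: 'host', children: [ /* osd nodes */ ] },
            1:     { id: 1, level: 'osd', parent: 'host1', size: 1.92, tags: { nvme: true } },
        },
    };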
@@ -663,15 +671,54 @@ class Mon
                 console.log('Pool '+pool_id+' has invalid max_osd_combinations (must be at least 100)');
             return false;
         }
+        if (pool_cfg.root_node && typeof(pool_cfg.root_node) != 'string')
+        {
+            if (warn)
+                console.log('Pool '+pool_id+' has invalid root_node (must be a string)');
+            return false;
+        }
+        if (pool_cfg.osd_tags && typeof(pool_cfg.osd_tags) != 'string' &&
+            (!(pool_cfg.osd_tags instanceof Array) || pool_cfg.osd_tags.filter(t => typeof t != 'string').length > 0))
+        {
+            if (warn)
+                console.log('Pool '+pool_id+' has invalid osd_tags (must be a string or array of strings)');
+            return false;
+        }
         return true;
     }
 
+    filter_osds_by_tags(orig_tree, flat_tree, tags)
+    {
+        if (!tags)
+        {
+            return;
+        }
+        for (const tag of (tags instanceof Array ? tags : [ tags ]))
+        {
+            for (const host in flat_tree)
+            {
+                let found = 0;
+                for (const osd in flat_tree[host])
+                {
+                    if (!orig_tree[osd].tags || !orig_tree[osd].tags[tag])
+                        delete flat_tree[host][osd];
+                    else
+                        found++;
+                }
+                if (!found)
+                {
+                    delete flat_tree[host];
+                }
+            }
+        }
+    }
+
     async recheck_pgs()
     {
         // Take configuration and state, check it against the stored configuration hash
         // Recalculate PGs and save them to etcd if the configuration is changed
         // FIXME: Also do not change anything if the distribution is good enough and no PGs are degraded
-        const { up_osds, osd_tree } = this.get_osd_tree();
+        const { up_osds, levels, osd_tree } = this.get_osd_tree();
         const tree_cfg = {
             osd_tree,
             pools: this.state.config.pools,
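filter_osds_by_tags() prunes the flattened failure-domain map in place rather than returning a new one (its return value carries no information, so the caller in recheck_pgs() below invokes it purely for the side effect): for each required tag it deletes OSDs lacking that tag, then drops any failure domain left without OSDs. A self-contained demonstration with invented data, inlining the method as a free function:

    // Inlined copy of the method above, plus invented sample data.
    function filter_osds_by_tags(orig_tree, flat_tree, tags)
    {
        if (!tags)
            return;
        for (const tag of (tags instanceof Array ? tags : [ tags ]))
        {
            for (const host in flat_tree)
            {
                let found = 0;
                for (const osd in flat_tree[host])
                {
                    if (!orig_tree[osd].tags || !orig_tree[osd].tags[tag])
                        delete flat_tree[host][osd];
                    else
                        found++;
                }
                if (!found)
                    delete flat_tree[host];
            }
        }
    }

    // OSD 1 is tagged nvme, OSD 2 carries no tags at all.
    const orig_tree = {
        1: { id: 1, level: 'osd', tags: { nvme: true } },
        2: { id: 2, level: 'osd' },
    };
    const flat_tree = { host1: { 1: true }, host2: { 2: true } };
    filter_osds_by_tags(orig_tree, flat_tree, 'nvme');
    console.log(flat_tree); // { host1: { '1': true } } - host2 lost its only OSD and was dropped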
@@ -688,6 +735,10 @@ class Mon
             {
                 continue;
             }
+            let pool_tree = osd_tree[pool_cfg.root_node || ''];
+            pool_tree = pool_tree ? pool_tree.children : [];
+            pool_tree = LPOptimizer.flatten_tree(pool_tree, levels, pool_cfg.failure_domain, 'osd');
+            this.filter_osds_by_tags(osd_tree, pool_tree, pool_cfg.osd_tags);
             const prev_pgs = [];
             for (const pg in ((this.state.config.pgs.items||{})[pool_id]||{})||{})
             {
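Together with filter_osds_by_tags() above, this makes PG placement per-pool: the tree is narrowed to the subtree under root_node (or the whole tree when unset), flattened down to the pool's own failure_domain instead of the single monitor-wide one, and pruned by osd_tags; the two hunks below then feed this pool-local tree to the optimizer in place of the shared tree_cfg.osd_tree.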
@@ -710,7 +761,7 @@ class Mon
                 }
                 optimize_result = await LPOptimizer.optimize_change({
                     prev_pgs,
-                    osd_tree: tree_cfg.osd_tree,
+                    osd_tree: pool_tree,
                     pg_size: pool_cfg.pg_size,
                     pg_minsize: pool_cfg.pg_minsize,
                     max_combinations: pool_cfg.max_osd_combinations,
@@ -719,7 +770,7 @@ class Mon
             else
             {
                 optimize_result = await LPOptimizer.optimize_initial({
-                    osd_tree: tree_cfg.osd_tree,
+                    osd_tree: pool_tree,
                     pg_count: pool_cfg.pg_count,
                     pg_size: pool_cfg.pg_size,
                     pg_minsize: pool_cfg.pg_minsize,