diff --git a/README.md b/README.md
index 69855e08..c13718f1 100644
--- a/README.md
+++ b/README.md
@@ -21,7 +21,7 @@ breaking changes in the future. However, the following is implemented:
 - Automatic data distribution over OSDs, with support for:
   - Mathematical optimization for better uniformity and less data movement
   - Multiple pools
-  - Placement tree
+  - Placement tree, OSD selection by tags (device classes) and placement root
   - Configurable failure domains
 - Recovery of degraded blocks
 - Rebalancing (data movement between OSDs)
@@ -46,6 +46,7 @@ breaking changes in the future. However, the following is implemented:
 - Inode metadata storage in etcd
 - Snapshots and copy-on-write image clones
 - Operation timeouts and better failure detection
+- Scrubbing without checksums (verification of replicas)
 - Checksums
 - SSD+HDD optimizations, possibly including tiered storage and soft journal flushes
 - RDMA and NVDIMM support
@@ -80,7 +81,7 @@ Architectural differences from Ceph:
   per drive you should run multiple OSDs each on a different partition of the drive.
   Vitastor isn't CPU-hungry though (as opposed to Ceph), so 1 core is sufficient in a lot of cases.
 - Metadata and journal are always kept in memory. Metadata size depends linearly on drive capacity
-  and data store block size which is 128 KB by default. With 128 KB blocks metadata should occupy
+  and data store block size which is 128 KB by default. With 128 KB blocks, metadata should occupy
   around 512 MB per 1 TB (which is still less than Ceph wants). Journal doesn't have to be big,
   the example test below was conducted with only 16 MB journal. A big journal is probably even
   harmful as dirty write metadata also take some memory.
diff --git a/mon/mon.js b/mon/mon.js
index 286674ef..f7984715 100644
--- a/mon/mon.js
+++ b/mon/mon.js
@@ -101,13 +101,15 @@ class Mon
                 failure_domain: 'host',
                 max_osd_combinations: 10000,
                 pg_stripe_size: 4194304,
-                // FIXME add device classes/tags
+                root_node?: 'rack1',
+                // restrict pool to OSDs having all of these tags
+                osd_tags?: 'nvme' | [ 'nvme', ... ],
             },
             ...
         }, */
         pools: {},
         osd: {
-            /* <id>: { reweight: 1 }, ... */
+            /* <id>: { reweight?: 1, tags?: [ 'nvme', ... ] }, ... */
         },
         /* pgs: {
             hash: string,
@@ -466,7 +468,8 @@ class Mon
         if (stat.size && (this.state.osd.state[osd_num] || Number(stat.time) >= down_time))
         {
             // Numeric IDs are reserved for OSDs
-            let reweight = this.state.config.osd[osd_num] && Number(this.state.config.osd[osd_num].reweight);
+            const osd_cfg = this.state.config.osd[osd_num];
+            let reweight = osd_cfg && Number(osd_cfg.reweight);
             if (reweight < 0 || isNaN(reweight))
                 reweight = 1;
             if (this.state.osd.state[osd_num] && reweight > 0)
@@ -477,6 +480,11 @@ class Mon
                 tree[osd_num] = tree[osd_num] || { id: osd_num, parent: stat.host };
                 tree[osd_num].level = 'osd';
                 tree[osd_num].size = reweight * stat.size / 1024 / 1024 / 1024 / 1024; // terabytes
+                if (osd_cfg && osd_cfg.tags)
+                {
+                    tree[osd_num].tags = (osd_cfg.tags instanceof Array ? [ ...osd_cfg.tags ] : [ osd_cfg.tags ])
+                        .reduce((a, c) => { a[c] = true; return a; }, {});
+                }
                 delete tree[osd_num].children;
             }
         }
@@ -496,7 +504,7 @@ class Mon
             tree[parent].children.push(tree[node_id]);
             delete node_cfg.parent;
         }
-        return { up_osds, osd_tree: LPOptimizer.flatten_tree(tree[''].children, levels, this.config.failure_domain, 'osd') };
+        return { up_osds, levels, osd_tree: tree };
     }
 
     async stop_all_pgs(pool_id)
@@ -663,15 +671,54 @@ class Mon
                 console.log('Pool '+pool_id+' has invalid max_osd_combinations (must be at least 100)');
             return false;
         }
+        if (pool_cfg.root_node && typeof(pool_cfg.root_node) != 'string')
+        {
+            if (warn)
+                console.log('Pool '+pool_id+' has invalid root_node (must be a string)');
+            return false;
+        }
+        if (pool_cfg.osd_tags && typeof(pool_cfg.osd_tags) != 'string' &&
+            (!(pool_cfg.osd_tags instanceof Array) || pool_cfg.osd_tags.filter(t => typeof t != 'string').length > 0))
+        {
+            if (warn)
+                console.log('Pool '+pool_id+' has invalid osd_tags (must be a string or array of strings)');
+            return false;
+        }
         return true;
     }
 
+    filter_osds_by_tags(orig_tree, flat_tree, tags)
+    {
+        if (!tags)
+        {
+            return false;
+        }
+        for (const tag of (tags instanceof Array ? tags : [ tags ]))
+        {
+            for (const host in flat_tree)
+            {
+                let found = 0;
+                for (const osd in flat_tree[host])
+                {
+                    if (!orig_tree[osd].tags || !orig_tree[osd].tags[tag])
+                        delete flat_tree[host][osd];
+                    else
+                        found++;
+                }
+                if (!found)
+                {
+                    delete flat_tree[host];
+                }
+            }
+        }
+    }
+
     async recheck_pgs()
     {
         // Take configuration and state, check it against the stored configuration hash
         // Recalculate PGs and save them to etcd if the configuration is changed
         // FIXME: Also do not change anything if the distribution is good enough and no PGs are degraded
-        const { up_osds, osd_tree } = this.get_osd_tree();
+        const { up_osds, levels, osd_tree } = this.get_osd_tree();
         const tree_cfg = {
             osd_tree,
             pools: this.state.config.pools,
@@ -688,6 +735,10 @@ class Mon
             {
                 continue;
             }
+            let pool_tree = osd_tree[pool_cfg.root_node || ''];
+            pool_tree = pool_tree ? pool_tree.children : [];
+            pool_tree = LPOptimizer.flatten_tree(pool_tree, levels, pool_cfg.failure_domain, 'osd');
+            this.filter_osds_by_tags(osd_tree, pool_tree, pool_cfg.osd_tags); // prunes pool_tree in place
             const prev_pgs = [];
             for (const pg in ((this.state.config.pgs.items||{})[pool_id]||{}))
             {
@@ -710,7 +761,7 @@ class Mon
                 }
                 optimize_result = await LPOptimizer.optimize_change({
                     prev_pgs,
-                    osd_tree: tree_cfg.osd_tree,
+                    osd_tree: pool_tree,
                     pg_size: pool_cfg.pg_size,
                     pg_minsize: pool_cfg.pg_minsize,
                     max_combinations: pool_cfg.max_osd_combinations,
@@ -719,7 +770,7 @@ class Mon
             else
            {
                 optimize_result = await LPOptimizer.optimize_initial({
-                    osd_tree: tree_cfg.osd_tree,
+                    osd_tree: pool_tree,
                     pg_count: pool_cfg.pg_count,
                     pg_size: pool_cfg.pg_size,
                     pg_minsize: pool_cfg.pg_minsize,
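
For illustration, a minimal sketch of how the new pool and OSD settings could be combined. The key names follow the comments added to mon.js above; the pool id, tag names and all numeric values are placeholders, and unrelated pool fields are omitted:

    // Hypothetical configuration fragment: pool 1 is restricted to the
    // 'rack1' placement subtree and to OSDs carrying the 'nvme' tag.
    // osd_tags may also be given as a single string, e.g. 'nvme'.
    const config = {
        pools: {
            1: {
                pg_size: 2,
                pg_minsize: 1,
                pg_count: 128,
                failure_domain: 'host',
                max_osd_combinations: 10000,
                root_node: 'rack1',
                osd_tags: [ 'nvme' ],
            },
        },
        osd: {
            1: { reweight: 1, tags: [ 'nvme' ] },
            2: { reweight: 1, tags: [ 'hdd' ] },
        },
    };

With such a configuration, recheck_pgs() flattens the subtree under 'rack1' and filter_osds_by_tags() removes OSD 2 (and any host left without matching OSDs) from the flattened tree, so PGs of pool 1 are only placed on 'nvme'-tagged OSDs.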