diff --git a/mon/mon.js b/mon/mon.js index 69b7c20a..a37d46e2 100644 --- a/mon/mon.js +++ b/mon/mon.js @@ -160,6 +160,8 @@ const etcd_tree = { root_node?: 'rack1', // restrict pool to OSDs having all of these tags osd_tags?: 'nvme' | [ 'nvme', ... ], + // prefer to put primary on OSD with these tags + primary_affinity_tags?: 'nvme' | [ 'nvme', ... ], }, ... }, */ @@ -903,27 +905,39 @@ class Mon return this.seed + 2147483648; } - pick_primary(pool_id, osd_set, up_osds) + pick_primary(pool_id, osd_set, up_osds, aff_osds) { let alive_set; if (this.state.config.pools[pool_id].scheme === 'replicated') - alive_set = osd_set.filter(osd_num => osd_num && up_osds[osd_num]); + { + // Prefer "affinity" OSDs + alive_set = osd_set.filter(osd_num => osd_num && aff_osds[osd_num]); + if (!alive_set.length) + alive_set = osd_set.filter(osd_num => osd_num && up_osds[osd_num]); + } else { // Prefer data OSDs for EC because they can actually read something without an additional network hop const pg_data_size = (this.state.config.pools[pool_id].pg_size||0) - (this.state.config.pools[pool_id].parity_chunks||0); - alive_set = osd_set.slice(0, pg_data_size).filter(osd_num => osd_num && up_osds[osd_num]); + alive_set = osd_set.slice(0, pg_data_size).filter(osd_num => osd_num && aff_osds[osd_num]); if (!alive_set.length) - alive_set = osd_set.filter(osd_num => osd_num && up_osds[osd_num]); + alive_set = osd_set.filter(osd_num => osd_num && aff_osds[osd_num]); + if (!alive_set.length) + { + alive_set = osd_set.slice(0, pg_data_size).filter(osd_num => osd_num && up_osds[osd_num]); + if (!alive_set.length) + alive_set = osd_set.filter(osd_num => osd_num && up_osds[osd_num]); + } } if (!alive_set.length) return 0; return alive_set[this.rng() % alive_set.length]; } - save_new_pgs_txn(request, pool_id, up_osds, prev_pgs, new_pgs, pg_history) + save_new_pgs_txn(request, pool_id, up_osds, osd_tree, prev_pgs, new_pgs, pg_history) { + const aff_osds = this.get_affinity_osds(this.state.config.pools[pool_id], up_osds, osd_tree); const pg_items = {}; this.reset_rng(); new_pgs.map((osd_set, i) => @@ -931,7 +945,7 @@ class Mon osd_set = osd_set.map(osd_num => osd_num === LPOptimizer.NO_OSD ? 0 : osd_num); pg_items[i+1] = { osd_set, - primary: this.pick_primary(pool_id, osd_set, up_osds), + primary: this.pick_primary(pool_id, osd_set, up_osds, aff_osds), }; if (prev_pgs[i] && prev_pgs[i].join(' ') != osd_set.join(' ') && prev_pgs[i].filter(osd_num => osd_num).length > 0) @@ -1062,6 +1076,13 @@ class Mon console.log('Pool '+pool_id+' has invalid osd_tags (must be a string or array of strings)'); return false; } + if (pool_cfg.primary_affinity_tags && typeof(pool_cfg.primary_affinity_tags) != 'string' && + (!(pool_cfg.primary_affinity_tags instanceof Array) || pool_cfg.primary_affinity_tags.filter(t => typeof t != 'string').length > 0)) + { + if (warn) + console.log('Pool '+pool_id+' has invalid primary_affinity_tags (must be a string or array of strings)'); + return false; + } return true; } @@ -1091,6 +1112,17 @@ class Mon } } + get_affinity_osds(pool_cfg, up_osds, osd_tree) + { + let aff_osds = up_osds; + if (pool_cfg.primary_affinity_tags) + { + aff_osds = { ...up_osds }; + this.filter_osds_by_tags(osd_tree, { x: aff_osds }, pool_cfg.primary_affinity_tags); + } + return aff_osds; + } + async recheck_pgs() { // Take configuration and state, check it against the stored configuration hash @@ -1121,7 +1153,7 @@ class Mon { prev_pgs[pg-1] = this.state.config.pgs.items[pool_id][pg].osd_set; } - this.save_new_pgs_txn(etcd_request, pool_id, up_osds, prev_pgs, [], []); + this.save_new_pgs_txn(etcd_request, pool_id, up_osds, osd_tree, prev_pgs, [], []); } } for (const pool_id in this.state.config.pools) @@ -1228,7 +1260,7 @@ class Mon key: b64(this.etcd_prefix+'/pool/stats/'+pool_id), value: b64(JSON.stringify(this.state.pool.stats[pool_id])), } }); - this.save_new_pgs_txn(etcd_request, pool_id, up_osds, real_prev_pgs, optimize_result.int_pgs, pg_history); + this.save_new_pgs_txn(etcd_request, pool_id, up_osds, osd_tree, real_prev_pgs, optimize_result.int_pgs, pg_history); } this.state.config.pgs.hash = tree_hash; await this.save_pg_config(etcd_request); @@ -1245,13 +1277,14 @@ class Mon continue; } const replicated = pool_cfg.scheme === 'replicated'; + const aff_osds = this.get_affinity_osds(pool_cfg, up_osds, osd_tree); this.reset_rng(); for (let pg_num = 1; pg_num <= pool_cfg.pg_count; pg_num++) { const pg_cfg = this.state.config.pgs.items[pool_id][pg_num]; if (pg_cfg) { - const new_primary = this.pick_primary(pool_id, pg_cfg.osd_set, up_osds); + const new_primary = this.pick_primary(pool_id, pg_cfg.osd_set, up_osds, aff_osds); if (pg_cfg.primary != new_primary) { console.log(