From ffe1cd4c796e308cfaa69ab84f46391cfb52e21d Mon Sep 17 00:00:00 2001 From: Vitaliy Filippov Date: Thu, 21 Jan 2021 00:30:18 +0300 Subject: [PATCH] Report inode I/O statistics, aggregate it in the monitor --- README.md | 2 +- mon/mon.js | 202 +++++++++++++++++++++---------------- src/msgr_op.h | 2 +- src/msgr_send.cpp | 10 +- src/osd.h | 12 +++ src/osd_cluster.cpp | 25 +++++ src/osd_primary_subops.cpp | 25 ++++- 7 files changed, 183 insertions(+), 95 deletions(-) diff --git a/README.md b/README.md index 6823286cd..e003f8bd3 100644 --- a/README.md +++ b/README.md @@ -34,12 +34,12 @@ breaking changes in the future. However, the following is implemented: - NBD proxy for kernel mounts - Inode removal tool (vitastor-rm) - Packaging for Debian and CentOS +- Per-inode I/O and space usage statistics ## Roadmap - OSD creation tool (OSDs currently have to be created by hand) - Other administrative tools -- Per-inode I/O and space usage statistics - Proxmox and OpenNebula plugins - iSCSI proxy - Inode metadata storage in etcd diff --git a/mon/mon.js b/mon/mon.js index 4f5ada393..0ca39d1c9 100644 --- a/mon/mon.js +++ b/mon/mon.js @@ -26,13 +26,14 @@ const etcd_allow = new RegExp('^'+[ 'config/pgs', 'osd/state/[1-9]\\d*', 'osd/stats/[1-9]\\d*', + 'osd/inodestats/[1-9]\\d*', 'osd/space/[1-9]\\d*', 'mon/master', 'pg/state/[1-9]\\d*/[1-9]\\d*', 'pg/stats/[1-9]\\d*/[1-9]\\d*', 'pg/history/[1-9]\\d*/[1-9]\\d*', 'history/last_clean_pgs', - 'inode/space/[1-9]\\d*', + 'inode/stats/[1-9]\\d*', 'stats', ].join('$|^')+'$'); @@ -174,6 +175,13 @@ const etcd_tree = { }, }, */ }, + inodestats: { + /* : { + read: { count: uint64_t, usec: uint64_t, bytes: uint64_t }, + write: { count: uint64_t, usec: uint64_t, bytes: uint64_t }, + delete: { count: uint64_t, usec: uint64_t, bytes: uint64_t }, + }, */ + }, space: { /* : { : uint64_t, // bytes @@ -219,9 +227,12 @@ const etcd_tree = { }, }, inode: { - space: { + stats: { /* : { - raw: uint64_t, // raw bytes on OSDs + raw_used: uint64_t, // raw 
used bytes on OSDs + read: { count: uint64_t, usec: uint64_t, bytes: uint64_t }, + write: { count: uint64_t, usec: uint64_t, bytes: uint64_t }, + delete: { count: uint64_t, usec: uint64_t, bytes: uint64_t }, }, */ }, }, @@ -409,7 +420,7 @@ class Mon { this.parse_kv(e.kv); const key = e.kv.key.substr(this.etcd_prefix.length); - if (key.substr(0, 11) == '/osd/stats/' || key.substr(0, 10) == '/pg/stats/') + if (key.substr(0, 11) == '/osd/stats/' || key.substr(0, 10) == '/pg/stats/' || key.substr(0, 16) == '/osd/inodestats/') { stats_changed = true; } @@ -417,7 +428,7 @@ class Mon { pg_states_changed = true; } - else if (key != '/stats' && key.substr(0, 13) != '/inode/space/') + else if (key != '/stats' && key.substr(0, 13) != '/inode/stats/') { changed = true; } @@ -1093,8 +1104,6 @@ class Mon sum_stats() { - let overflow = false; - this.prev_stats = this.prev_stats || { op_stats: {}, subop_stats: {}, recovery_stats: {} }; const op_stats = {}, subop_stats = {}, recovery_stats = {}; for (const osd in this.state.osd.stats) { @@ -1119,52 +1128,11 @@ class Mon recovery_stats[op].bytes += BigInt(st.recovery_stats[op].bytes||0); } } - for (const op in op_stats) - { - if (op_stats[op].count >= 0x10000000000000000n) - { - if (!this.prev_stats.op_stats[op]) - { - overflow = true; - } - else - { - op_stats[op].count -= this.prev_stats.op_stats[op].count; - op_stats[op].usec -= this.prev_stats.op_stats[op].usec; - op_stats[op].bytes -= this.prev_stats.op_stats[op].bytes; - } - } - } - for (const op in subop_stats) - { - if (subop_stats[op].count >= 0x10000000000000000n) - { - if (!this.prev_stats.subop_stats[op]) - { - overflow = true; - } - else - { - subop_stats[op].count -= this.prev_stats.subop_stats[op].count; - subop_stats[op].usec -= this.prev_stats.subop_stats[op].usec; - } - } - } - for (const op in recovery_stats) - { - if (recovery_stats[op].count >= 0x10000000000000000n) - { - if (!this.prev_stats.recovery_stats[op]) - { - overflow = true; - } - else - { - 
recovery_stats[op].count -= this.prev_stats.recovery_stats[op].count; - recovery_stats[op].bytes -= this.prev_stats.recovery_stats[op].bytes; - } - } - } + return { op_stats, subop_stats, recovery_stats }; + } + + sum_object_counts() + { const object_counts = { object: 0n, clean: 0n, misplaced: 0n, degraded: 0n, incomplete: 0n }; for (const pool_id in this.state.pg.stats) { @@ -1183,49 +1151,107 @@ class Mon } } } - return (this.prev_stats = { overflow, op_stats, subop_stats, recovery_stats, object_counts }); + return object_counts; + } + + sum_inode_stats() + { + const inode_stats = {}; + const inode_stub = () => ({ + raw_used: 0n, + read: { count: 0n, usec: 0n, bytes: 0n }, + write: { count: 0n, usec: 0n, bytes: 0n }, + delete: { count: 0n, usec: 0n, bytes: 0n }, + }); + for (const osd_num in this.state.osd.space) + { + for (const inode_num in this.state.osd.space[osd_num]) + { + inode_stats[inode_num] = inode_stats[inode_num] || inode_stub(); + inode_stats[inode_num].raw_used += BigInt(this.state.osd.space[osd_num][inode_num]||0); + } + } + for (const osd_num in this.state.osd.inodestats) + { + const ist = this.state.osd.inodestats[osd_num]; + for (const inode_num in ist) + { + inode_stats[inode_num] = inode_stats[inode_num] || inode_stub(); + for (const op of [ 'read', 'write', 'delete' ]) + { + inode_stats[inode_num][op].count += BigInt(ist[inode_num][op].count||0); + inode_stats[inode_num][op].usec += BigInt(ist[inode_num][op].usec||0); + inode_stats[inode_num][op].bytes += BigInt(ist[inode_num][op].bytes||0); + } + } + } + return inode_stats; + } + + fix_stat_overflows(obj, scratch) + { + for (const k in obj) + { + if (typeof obj[k] == 'bigint') + { + if (obj[k] >= 0x10000000000000000n) + { + if (scratch[k]) + { + for (const k2 in scratch) + { + obj[k2] -= scratch[k2]; + scratch[k2] = 0n; + } + } + else + { + for (const k2 in obj) + { + scratch[k2] = obj[k2]; + } + } + } + } + else if (typeof obj[k] == 'object') + { + this.fix_stat_overflows(obj[k], 
scratch[k] = (scratch[k] || {})); + } + } + } + + serialize_bigints(obj) + { + for (const k in obj) + { + if (typeof obj[k] == 'bigint') + { + obj[k] = ''+obj[k]; + } + else if (typeof obj[k] == 'object') + { + this.serialize_bigints(obj[k]); + } + } } async update_total_stats() { const txn = []; const stats = this.sum_stats(); - if (!stats.overflow) - { - // Convert to strings, serialize and save - const ser = {}; - for (const st of [ 'op_stats', 'subop_stats', 'recovery_stats' ]) - { - ser[st] = {}; - for (const op in stats[st]) - { - ser[st][op] = {}; - for (const k in stats[st][op]) - { - ser[st][op][k] = ''+stats[st][op][k]; - } - } - } - ser.object_counts = {}; - for (const k in stats.object_counts) - { - ser.object_counts[k] = ''+stats.object_counts[k]; - } - txn.push({ requestPut: { key: b64(this.etcd_prefix+'/stats'), value: b64(JSON.stringify(ser)) } }); - } - const space_stats = {}; - for (const osd_num in this.state.osd.space) - { - for (const inode_num in this.state.osd.space[osd_num]) - { - space_stats[inode_num] = (space_stats[inode_num] || BigInt(0)) + BigInt(this.state.osd.space[osd_num][inode_num]||0); - } - } - for (const inode_num in space_stats) + const object_counts = this.sum_object_counts(); + const inode_stats = this.sum_inode_stats(); + this.fix_stat_overflows(stats, (this.prev_stats = this.prev_stats || {})); + this.fix_stat_overflows(inode_stats, (this.prev_inode_stats = this.prev_inode_stats || {})); + stats.object_counts = object_counts; + this.serialize_bigints(stats); + this.serialize_bigints(inode_stats); + txn.push({ requestPut: { key: b64(this.etcd_prefix+'/stats'), value: b64(JSON.stringify(stats)) } }); + for (const inode_num in inode_stats) { txn.push({ requestPut: { - key: b64(this.etcd_prefix+'/inode/space/'+inode_num), - value: b64(JSON.stringify({ raw: ''+space_stats[inode_num] })), + key: b64(this.etcd_prefix+'/inode/stats/'+inode_num), + value: b64(JSON.stringify(inode_stats[inode_num])), } }); } if (txn.length) diff 
--git a/src/msgr_op.h b/src/msgr_op.h index 18667e040..237bb10f1 100644 --- a/src/msgr_op.h +++ b/src/msgr_op.h @@ -154,7 +154,7 @@ struct osd_primary_op_data_t; struct osd_op_t { - timespec tv_begin; + timespec tv_begin = { 0 }, tv_end = { 0 }; uint64_t op_type = OSD_OP_IN; int peer_fd; osd_any_op_t req; diff --git a/src/msgr_send.cpp b/src/msgr_send.cpp index d5c8090da..5fd0539ff 100644 --- a/src/msgr_send.cpp +++ b/src/msgr_send.cpp @@ -109,8 +109,10 @@ void osd_messenger_t::measure_exec(osd_op_t *cur_op) { return; } - timespec tv_end; - clock_gettime(CLOCK_REALTIME, &tv_end); + if (!cur_op->tv_end.tv_sec) + { + clock_gettime(CLOCK_REALTIME, &cur_op->tv_end); + } stats.op_stat_count[cur_op->req.hdr.opcode]++; if (!stats.op_stat_count[cur_op->req.hdr.opcode]) { @@ -119,8 +121,8 @@ void osd_messenger_t::measure_exec(osd_op_t *cur_op) stats.op_stat_bytes[cur_op->req.hdr.opcode] = 0; } stats.op_stat_sum[cur_op->req.hdr.opcode] += ( - (tv_end.tv_sec - cur_op->tv_begin.tv_sec)*1000000 + - (tv_end.tv_nsec - cur_op->tv_begin.tv_nsec)/1000 + (cur_op->tv_end.tv_sec - cur_op->tv_begin.tv_sec)*1000000 + + (cur_op->tv_end.tv_nsec - cur_op->tv_begin.tv_nsec)/1000 ); if (cur_op->req.hdr.opcode == OSD_OP_READ || cur_op->req.hdr.opcode == OSD_OP_WRITE) diff --git a/src/osd.h b/src/osd.h index bef90f595..35c7fdd82 100644 --- a/src/osd.h +++ b/src/osd.h @@ -55,6 +55,17 @@ struct osd_recovery_op_t osd_op_t *osd_op = NULL; }; +// Posted as /osd/inodestats/$osd, then accumulated by the monitor +#define INODE_STATS_READ 0 +#define INODE_STATS_WRITE 1 +#define INODE_STATS_DELETE 2 +struct inode_stats_t +{ + uint64_t op_sum[3] = { 0 }; + uint64_t op_count[3] = { 0 }; + uint64_t op_bytes[3] = { 0 }; +}; + class osd_t { // config @@ -126,6 +137,7 @@ class osd_t // op statistics osd_op_stats_t prev_stats; + std::map<uint64_t, inode_stats_t> inode_stats; const char* recovery_stat_names[2] = { "degraded", "misplaced" }; uint64_t recovery_stat_count[2][2] = { 0 }; uint64_t recovery_stat_bytes[2][2] = { 0 }; diff 
--git a/src/osd_cluster.cpp b/src/osd_cluster.cpp index 12c59ee97..ec8c0140d 100644 --- a/src/osd_cluster.cpp +++ b/src/osd_cluster.cpp @@ -187,6 +187,27 @@ void osd_t::report_statistics() { inode_space[std::to_string(kv.first)] = kv.second; } + json11::Json::object inode_ops; + for (auto kv: inode_stats) + { + inode_ops[std::to_string(kv.first)] = json11::Json::object { + { "read", json11::Json::object { + { "count", kv.second.op_count[INODE_STATS_READ] }, + { "usec", kv.second.op_sum[INODE_STATS_READ] }, + { "bytes", kv.second.op_bytes[INODE_STATS_READ] }, + } }, + { "write", json11::Json::object { + { "count", kv.second.op_count[INODE_STATS_WRITE] }, + { "usec", kv.second.op_sum[INODE_STATS_WRITE] }, + { "bytes", kv.second.op_bytes[INODE_STATS_WRITE] }, + } }, + { "delete", json11::Json::object { + { "count", kv.second.op_count[INODE_STATS_DELETE] }, + { "usec", kv.second.op_sum[INODE_STATS_DELETE] }, + { "bytes", kv.second.op_bytes[INODE_STATS_DELETE] }, + } }, + }; + } json11::Json::array txn = { json11::Json::object { { "request_put", json11::Json::object { { "key", base64_encode(st_cli.etcd_prefix+"/osd/stats/"+std::to_string(osd_num)) }, @@ -196,6 +217,10 @@ void osd_t::report_statistics() { "key", base64_encode(st_cli.etcd_prefix+"/osd/space/"+std::to_string(osd_num)) }, { "value", base64_encode(json11::Json(inode_space).dump()) }, } }, + { "request_put", json11::Json::object { + { "key", base64_encode(st_cli.etcd_prefix+"/osd/inodestats/"+std::to_string(osd_num)) }, + { "value", base64_encode(json11::Json(inode_ops).dump()) }, + } }, } }; for (auto & p: pgs) { diff --git a/src/osd_primary_subops.cpp b/src/osd_primary_subops.cpp index 8fd034381..ec8209d61 100644 --- a/src/osd_primary_subops.cpp +++ b/src/osd_primary_subops.cpp @@ -36,6 +36,29 @@ void osd_t::autosync() void osd_t::finish_op(osd_op_t *cur_op, int retval) { inflight_ops--; + if (cur_op->req.hdr.opcode == OSD_OP_READ || + cur_op->req.hdr.opcode == OSD_OP_WRITE || + cur_op->req.hdr.opcode == 
OSD_OP_DELETE) + { + // Track inode statistics + if (!cur_op->tv_end.tv_sec) + { + clock_gettime(CLOCK_REALTIME, &cur_op->tv_end); + } + uint64_t usec = ( + (cur_op->tv_end.tv_sec - cur_op->tv_begin.tv_sec)*1000000 + + (cur_op->tv_end.tv_nsec - cur_op->tv_begin.tv_nsec)/1000 + ); + int inode_st_op = cur_op->req.hdr.opcode == OSD_OP_DELETE + ? INODE_STATS_DELETE + : (cur_op->req.hdr.opcode == OSD_OP_READ ? INODE_STATS_READ : INODE_STATS_WRITE); + inode_stats[cur_op->req.rw.inode].op_count[inode_st_op]++; + inode_stats[cur_op->req.rw.inode].op_sum[inode_st_op] += usec; + if (cur_op->req.hdr.opcode == OSD_OP_DELETE) + inode_stats[cur_op->req.rw.inode].op_bytes[inode_st_op] += cur_op->op_data->pg_data_size * bs_block_size; + else + inode_stats[cur_op->req.rw.inode].op_bytes[inode_st_op] += cur_op->req.rw.len; + } if (cur_op->op_data) { if (cur_op->op_data->pg_num > 0) @@ -66,7 +89,7 @@ void osd_t::finish_op(osd_op_t *cur_op, int retval) } else { - // FIXME add separate magic number + // FIXME add separate magic number for primary ops auto cl_it = c_cli.clients.find(cur_op->peer_fd); if (cl_it != c_cli.clients.end()) {