Browse Source

Report inode I/O statistics, aggregate it in the monitor

undefined
Vitaliy Filippov 1 month ago
parent
commit
de767d38f4
7 changed files with 171 additions and 83 deletions
  1. +1
    -1
      README.md
  2. +1
    -1
      messenger.h
  3. +102
    -76
      mon/mon.js
  4. +6
    -4
      msgr_send.cpp
  5. +12
    -0
      osd.h
  6. +25
    -0
      osd_cluster.cpp
  7. +24
    -1
      osd_primary_subops.cpp

+ 1
- 1
README.md View File

@@ -34,12 +34,12 @@ breaking changes in the future. However, the following is implemented:
- NBD proxy for kernel mounts
- Inode removal tool (vitastor-rm)
- Packaging for Debian and CentOS
- Per-inode I/O and space usage statistics

## Roadmap

- OSD creation tool (OSDs currently have to be created by hand)
- Other administrative tools
- Per-inode I/O and space usage statistics
- Proxmox and OpenNebula plugins
- iSCSI proxy
- Inode metadata storage in etcd


+ 1
- 1
messenger.h View File

@@ -174,7 +174,7 @@ struct osd_primary_op_data_t;

struct osd_op_t
{
timespec tv_begin;
timespec tv_begin = { 0 }, tv_end = { 0 };
uint64_t op_type = OSD_OP_IN;
int peer_fd;
osd_any_op_t req;


+ 102
- 76
mon/mon.js View File

@@ -18,12 +18,13 @@ const etcd_allow = new RegExp('^'+[
'config/pgs',
'osd/state/[1-9]\\d*',
'osd/stats/[1-9]\\d*',
'osd/inodestats/[1-9]\\d*',
'osd/space/[1-9]\\d*',
'mon/master',
'pg/state/[1-9]\\d*/[1-9]\\d*',
'pg/stats/[1-9]\\d*/[1-9]\\d*',
'pg/history/[1-9]\\d*/[1-9]\\d*',
'inode/space/[1-9]\\d*',
'inode/stats/[1-9]\\d*',
'stats',
].join('$|^')+'$');

@@ -157,6 +158,13 @@ const etcd_tree = {
},
}, */
},
inodestats: {
/* <inode_t>: {
read: { count: uint64_t, usec: uint64_t, bytes: uint64_t },
write: { count: uint64_t, usec: uint64_t, bytes: uint64_t },
delete: { count: uint64_t, usec: uint64_t, bytes: uint64_t },
}, */
},
space: {
/* <osd_num_t>: {
<inode_t>: uint64_t, // bytes
@@ -202,9 +210,12 @@ const etcd_tree = {
},
},
inode: {
space: {
stats: {
/* <inode_t>: {
raw: uint64_t, // raw bytes on OSDs
raw_used: uint64_t, // raw used bytes on OSDs
read: { count: uint64_t, usec: uint64_t, bytes: uint64_t },
write: { count: uint64_t, usec: uint64_t, bytes: uint64_t },
delete: { count: uint64_t, usec: uint64_t, bytes: uint64_t },
}, */
},
},
@@ -380,11 +391,11 @@ class Mon
{
this.parse_kv(e.kv);
const key = e.kv.key.substr(this.etcd_prefix.length);
if (key.substr(0, 11) == '/osd/stats/' || key.substr(0, 10) == '/pg/stats/')
if (key.substr(0, 11) == '/osd/stats/' || key.substr(0, 10) == '/pg/stats/' || key.substr(0, 16) == '/osd/inodestats/')
{
stats_changed = true;
}
else if (key != '/stats' && key.substr(0, 13) != '/inode/space/')
else if (key != '/stats' && key.substr(0, 13) != '/inode/stats/')
{
changed = true;
}
@@ -965,8 +976,6 @@ class Mon

sum_stats()
{
let overflow = false;
this.prev_stats = this.prev_stats || { op_stats: {}, subop_stats: {}, recovery_stats: {} };
const op_stats = {}, subop_stats = {}, recovery_stats = {};
for (const osd in this.state.osd.stats)
{
@@ -991,110 +1000,127 @@ class Mon
recovery_stats[op].bytes += BigInt(st.recovery_stats.bytes||0);
}
}
for (const op in op_stats)
{
if (op_stats[op].count >= 0x10000000000000000n)
{
if (!this.prev_stats.op_stats[op])
{
overflow = true;
}
else
{
op_stats[op].count -= this.prev_stats.op_stats[op].count;
op_stats[op].usec -= this.prev_stats.op_stats[op].usec;
op_stats[op].bytes -= this.prev_stats.op_stats[op].bytes;
}
}
}
for (const op in subop_stats)
return { op_stats, subop_stats, recovery_stats };
}

sum_object_counts()
{
const object_counts = { object: 0n, clean: 0n, misplaced: 0n, degraded: 0n, incomplete: 0n };
for (const pool_id in this.state.pg.stats)
{
if (subop_stats[op].count >= 0x10000000000000000n)
for (const pg_num in this.state.pg.stats[pool_id])
{
if (!this.prev_stats.subop_stats[op])
{
overflow = true;
}
else
const st = this.state.pg.stats[pool_id][pg_num];
for (const k in object_counts)
{
subop_stats[op].count -= this.prev_stats.subop_stats[op].count;
subop_stats[op].usec -= this.prev_stats.subop_stats[op].usec;
if (st[k+'_count'])
{
object_counts[k] += BigInt(st[k+'_count']);
}
}
}
}
for (const op in recovery_stats)
return object_counts;
}

sum_inode_stats()
{
const inode_stats = {};
const inode_stub = () => ({
raw_used: 0n,
read: { count: 0n, usec: 0n, bytes: 0n },
write: { count: 0n, usec: 0n, bytes: 0n },
delete: { count: 0n, usec: 0n, bytes: 0n },
});
for (const osd_num in this.state.osd.space)
{
if (recovery_stats[op].count >= 0x10000000000000000n)
for (const inode_num in this.state.osd.space[osd_num])
{
if (!this.prev_stats.recovery_stats[op])
{
overflow = true;
}
else
{
recovery_stats[op].count -= this.prev_stats.recovery_stats[op].count;
recovery_stats[op].bytes -= this.prev_stats.recovery_stats[op].bytes;
}
inode_stats[inode_num] = inode_stats[inode_num] || inode_stub();
inode_stats[inode_num].raw_used += BigInt(this.state.osd.space[osd_num][inode_num]||0);
}
}
const object_counts = { object: 0n, clean: 0n, misplaced: 0n, degraded: 0n, incomplete: 0n };
for (const pool_id in this.state.pg.stats)
for (const osd_num in this.state.osd.inodestats)
{
for (const pg_num in this.state.pg.stats[pool_id])
const ist = this.state.osd.inodestats[osd_num];
for (const inode_num in ist)
{
const st = this.state.pg.stats[pool_id][pg_num];
for (const k in object_counts)
inode_stats[inode_num] = inode_stats[inode_num] || inode_stub();
for (const op of [ 'read', 'write', 'delete' ])
{
if (st[k+'_count'])
{
object_counts[k] += BigInt(st[k+'_count']);
}
// Accumulate this OSD's counters into the per-inode totals.
// The totals are keyed by inode_num (the loop variable); there is no
// `inode` binding in this function, so using it threw a ReferenceError.
inode_stats[inode_num][op].count += BigInt(ist[inode_num][op].count||0);
inode_stats[inode_num][op].usec += BigInt(ist[inode_num][op].usec||0);
inode_stats[inode_num][op].bytes += BigInt(ist[inode_num][op].bytes||0);
}
}
}
return (this.prev_stats = { overflow, op_stats, subop_stats, recovery_stats, object_counts });
return inode_stats;
}

async update_total_stats()
fix_stat_overflows(obj, scratch)
{
const txn = [];
const stats = this.sum_stats();
if (!stats.overflow)
for (const k in obj)
{
// Convert to strings, serialize and save
const ser = {};
for (const st of [ 'op_stats', 'subop_stats', 'recovery_stats' ])
if (typeof obj[k] == 'bigint')
{
ser[st] = {};
for (const op in stats[st])
if (obj[k] >= 0x10000000000000000n)
{
ser[st][op] = {};
for (const k in stats[st][op])
if (scratch[k])
{
for (const k2 in scratch)
{
obj[k2] -= scratch[k2];
scratch[k2] = 0n;
}
}
else
{
ser[st][op][k] = ''+stats[st][op][k];
for (const k2 in obj)
{
scratch[k2] = obj[k2];
}
}
}
}
ser.object_counts = {};
for (const k in stats.object_counts)
else if (typeof obj[k] == 'object')
{
ser.object_counts[k] = ''+stats.object_counts[k];
this.fix_stat_overflows(obj[k], scratch[k] = (scratch[k] || {}));
}
txn.push({ requestPut: { key: b64(this.etcd_prefix+'/stats'), value: b64(JSON.stringify(ser)) } });
}
const space_stats = {};
for (const osd_num in this.state.osd.space)
}

serialize_bigints(obj)
{
for (const k in obj)
{
for (const inode_num in this.state.osd.space[osd_num])
if (typeof obj[k] == 'bigint')
{
obj[k] = ''+obj[k];
}
else if (typeof obj[k] == 'object')
{
space_stats[inode_num] = (space_stats[inode_num] || BigInt(0)) + BigInt(this.state.osd.space[osd_num][inode_num]||0);
this.serialize_bigints(obj[k]);
}
}
for (const inode_num in space_stats)
}

async update_total_stats()
{
const txn = [];
const stats = this.sum_stats();
const object_counts = this.sum_object_counts();
const inode_stats = this.sum_inode_stats();
this.fix_stat_overflows(stats, (this.prev_stats = this.prev_stats || {}));
this.fix_stat_overflows(inode_stats, (this.prev_inode_stats = this.prev_inode_stats || {}));
stats.object_counts = object_counts;
this.serialize_bigints(stats);
this.serialize_bigints(inode_stats);
txn.push({ requestPut: { key: b64(this.etcd_prefix+'/stats'), value: b64(JSON.stringify(stats)) } });
for (const inode_num in inode_stats)
{
txn.push({ requestPut: {
key: b64(this.etcd_prefix+'/inode/space/'+inode_num),
value: b64(JSON.stringify({ raw: ''+space_stats[inode_num] })),
key: b64(this.etcd_prefix+'/inode/stats/'+inode_num),
value: b64(JSON.stringify(inode_stats[inode_num])),
} });
}
if (txn.length)


+ 6
- 4
msgr_send.cpp View File

@@ -103,8 +103,10 @@ void osd_messenger_t::outbox_push(osd_op_t *cur_op)
void osd_messenger_t::measure_exec(osd_op_t *cur_op)
{
// Measure execution latency
timespec tv_end;
clock_gettime(CLOCK_REALTIME, &tv_end);
if (!cur_op->tv_end.tv_sec)
{
clock_gettime(CLOCK_REALTIME, &cur_op->tv_end);
}
stats.op_stat_count[cur_op->req.hdr.opcode]++;
if (!stats.op_stat_count[cur_op->req.hdr.opcode])
{
@@ -113,8 +115,8 @@ void osd_messenger_t::measure_exec(osd_op_t *cur_op)
stats.op_stat_bytes[cur_op->req.hdr.opcode] = 0;
}
stats.op_stat_sum[cur_op->req.hdr.opcode] += (
(tv_end.tv_sec - cur_op->tv_begin.tv_sec)*1000000 +
(tv_end.tv_nsec - cur_op->tv_begin.tv_nsec)/1000
(cur_op->tv_end.tv_sec - cur_op->tv_begin.tv_sec)*1000000 +
(cur_op->tv_end.tv_nsec - cur_op->tv_begin.tv_nsec)/1000
);
if (cur_op->req.hdr.opcode == OSD_OP_READ ||
cur_op->req.hdr.opcode == OSD_OP_WRITE)


+ 12
- 0
osd.h View File

@@ -54,6 +54,17 @@ struct osd_recovery_op_t
osd_op_t *osd_op = NULL;
};

// Per-inode I/O counters kept by each OSD.
// They are published under /osd/inodestats/$osd and summed up by the monitor.
// The three macros below are the slot indexes used in every array of inode_stats_t.
#define INODE_STATS_READ 0
#define INODE_STATS_WRITE 1
#define INODE_STATS_DELETE 2
struct inode_stats_t
{
    // Accumulated operation latency, in microseconds, per operation kind
    uint64_t op_sum[3] = {};
    // Number of finished operations per operation kind
    uint64_t op_count[3] = {};
    // Number of bytes processed per operation kind
    uint64_t op_bytes[3] = {};
};

class osd_t
{
// config
@@ -120,6 +131,7 @@ class osd_t

// op statistics
osd_op_stats_t prev_stats;
std::map<uint64_t, inode_stats_t> inode_stats;
const char* recovery_stat_names[2] = { "degraded", "misplaced" };
uint64_t recovery_stat_count[2][2] = { 0 };
uint64_t recovery_stat_bytes[2][2] = { 0 };


+ 25
- 0
osd_cluster.cpp View File

@@ -186,6 +186,27 @@ void osd_t::report_statistics()
{
inode_space[std::to_string(kv.first)] = kv.second;
}
// Per-inode I/O counters, serialized as
// { "<inode>": { "read"|"write"|"delete": { "count", "usec", "bytes" } } }
json11::Json::object inode_ops;
// Iterate by const reference: copying every map entry on each report is needless
for (const auto & kv: inode_stats)
{
    const auto & st = kv.second;
    inode_ops[std::to_string(kv.first)] = json11::Json::object {
        { "read", json11::Json::object {
            { "count", st.op_count[INODE_STATS_READ] },
            { "usec", st.op_sum[INODE_STATS_READ] },
            { "bytes", st.op_bytes[INODE_STATS_READ] },
        } },
        { "write", json11::Json::object {
            { "count", st.op_count[INODE_STATS_WRITE] },
            { "usec", st.op_sum[INODE_STATS_WRITE] },
            { "bytes", st.op_bytes[INODE_STATS_WRITE] },
        } },
        { "delete", json11::Json::object {
            { "count", st.op_count[INODE_STATS_DELETE] },
            { "usec", st.op_sum[INODE_STATS_DELETE] },
            { "bytes", st.op_bytes[INODE_STATS_DELETE] },
        } },
    };
}
json11::Json::array txn = { json11::Json::object {
{ "request_put", json11::Json::object {
{ "key", base64_encode(st_cli.etcd_prefix+"/osd/stats/"+std::to_string(osd_num)) },
@@ -195,6 +216,10 @@ void osd_t::report_statistics()
{ "key", base64_encode(st_cli.etcd_prefix+"/osd/space/"+std::to_string(osd_num)) },
{ "value", base64_encode(json11::Json(inode_space).dump()) },
} },
{ "request_put", json11::Json::object {
{ "key", base64_encode(st_cli.etcd_prefix+"/osd/inodestats/"+std::to_string(osd_num)) },
{ "value", base64_encode(json11::Json(inode_ops).dump()) },
} },
} };
for (auto & p: pgs)
{


+ 24
- 1
osd_primary_subops.cpp View File

@@ -36,6 +36,29 @@ void osd_t::autosync()
void osd_t::finish_op(osd_op_t *cur_op, int retval)
{
inflight_ops--;
if (cur_op->req.hdr.opcode == OSD_OP_READ ||
    cur_op->req.hdr.opcode == OSD_OP_WRITE ||
    cur_op->req.hdr.opcode == OSD_OP_DELETE)
{
    // Track per-inode statistics for read, write and delete operations
    if (!cur_op->tv_end.tv_sec)
    {
        // Take the end timestamp only if nothing has stamped this op yet
        clock_gettime(CLOCK_REALTIME, &cur_op->tv_end);
    }
    // Operation latency in microseconds
    uint64_t usec = (
        (cur_op->tv_end.tv_sec - cur_op->tv_begin.tv_sec)*1000000 +
        (cur_op->tv_end.tv_nsec - cur_op->tv_begin.tv_nsec)/1000
    );
    int inode_st_op = cur_op->req.hdr.opcode == OSD_OP_DELETE
        ? INODE_STATS_DELETE
        : (cur_op->req.hdr.opcode == OSD_OP_READ ? INODE_STATS_READ : INODE_STATS_WRITE);
    // Look the entry up (creating it on first use) once instead of once per counter
    auto & ist = inode_stats[cur_op->req.rw.inode];
    ist.op_count[inode_st_op]++;
    ist.op_sum[inode_st_op] += usec;
    if (cur_op->req.hdr.opcode != OSD_OP_DELETE)
    {
        ist.op_bytes[inode_st_op] += cur_op->req.rw.len;
    }
    else if (cur_op->op_data)
    {
        // op_data may be NULL here (it is checked right after this block),
        // so the size of a delete is only accounted when op_data is present
        ist.op_bytes[inode_st_op] += cur_op->op_data->pg_data_size * bs_block_size;
    }
}
if (cur_op->op_data)
{
if (cur_op->op_data->pg_num > 0)
@@ -62,7 +85,7 @@ void osd_t::finish_op(osd_op_t *cur_op, int retval)
}
else
{
// FIXME add separate magic number
// FIXME add separate magic number for primary ops
auto cl_it = c_cli.clients.find(cur_op->peer_fd);
if (cl_it != c_cli.clients.end())
{


Loading…
Cancel
Save