Wrap cluster AFR calculator in a webpage

master
Vitaliy Filippov 2021-01-15 15:45:01 +03:00
commit 07fbb7e98c
7 changed files with 382 additions and 0 deletions

7
.babelrc Normal file
View File

@ -0,0 +1,7 @@
{
"presets": [ [ "env" ], "stage-1" ],
"retainLines": true,
"plugins": [
[ "transform-react-jsx", { "pragma": "preact.h" } ]
]
}

145
afr.js Normal file
View File

@ -0,0 +1,145 @@
// Functions to calculate Annualized Failure Rate of your cluster
// if you know AFR of your drives, number of drives, expected rebalance time
// and replication factor
// License: VNPL-1.0 (see https://yourcmc.ru/git/vitalif/vitastor/src/branch/master/README.md for details) or AGPL-3.0
// Author: Vitaliy Filippov, 2020+
// Public API of this module (CommonJS).
// The functions are declared further down in this file; function declarations
// are hoisted, so they can already be referenced here.
// The cluster_afr_pgs* variants are not exported directly - they are reached
// through the cluster_afr() wrapper.
module.exports = {
cluster_afr_fullmesh,
failure_rate_fullmesh,
cluster_afr,
print_cluster_afr,
c_n_k,
};
/******** "FULL MESH": ASSUME EACH OSD COMMUNICATES WITH ALL OTHER OSDS ********/
// Annualized failure rate of a whole "full mesh" cluster.
//   n   - number of drives
//   afr - annualized failure rate of a single drive
//   l   - expected rebalance time, in days, after a single drive failure
//   k   - replication factor, i.e. the number of drives that must be down
//         at the same time for the cluster to fail
function cluster_afr_fullmesh(n, afr, l, k)
{
    // Besides the first failed drive, k-1 more drives have to fail...
    const extra_failures = k-1;
    // ...out of the drives that are left
    const remaining = n-extra_failures;
    // Failure probability of one drive during the rebalance window (l days out of 365)
    const p_window = afr*l/365;
    // P(a given drive fails within a year) * failure_rate_fullmesh() for the remaining drives
    const p_loss = afr * failure_rate_fullmesh(remaining, p_window, extra_failures);
    return 1 - (1 - p_loss) ** remaining;
}
// Probability of at least <f> failures in a cluster with <n> drives with AFR=<a>.
// Computed as 1 minus the binomial probabilities of exactly 0, 1, ..., f-1 failures.
// Note: for f <= 0 the function returns (1-a)**n, i.e. the probability
// that none of the <n> drives fail.
function failure_rate_fullmesh(n, a, f)
{
    const survive = 1-a;
    if (f <= 0)
    {
        return survive**n;
    }
    let at_least = 1;
    let failed = 0;
    while (failed < f)
    {
        // Subtract P(exactly <failed> drives out of <n> fail)
        at_least -= c_n_k(n, failed) * survive**(n-failed) * a**failed;
        failed++;
    }
    return at_least;
}
/******** PGS: EACH OSD ONLY COMMUNICATES WITH <pgs> OTHER OSDs ********/
// <n> hosts of <m> drives of <capacity> GB, each able to backfill at <speed> GB/s,
// <k> replicas, <pgs> unique peer PGs per OSD
//
// For each of n*m drives: P(drive fails in a year) * P(any of its peers fail in <l*365> next days).
// More peers per OSD increase rebalance speed (more drives work together to resilver) if you
// let them finish rebalance BEFORE replacing the failed drive.
// At the same time, more peers per OSD increase probability of any of them to fail!
//
// Probability of all except one drives in a replica group to fail is (AFR^(k-1)).
// So with <x> PGs it becomes ~ (x * (AFR*L/365)^(k-1)). Interesting but reasonable consequence
// is that, with k=2, total failure rate doesn't depend on number of peers per OSD,
// because it gets increased linearly by increased number of peers to fail
// and decreased linearly by reduced rebalance time.
function cluster_afr_pgs({ n_hosts, n_drives, afr_drive, capacity, speed, replicas, pgs = 1, degraded_replacement })
{
    const other_replicas = replicas-1;
    // An OSD cannot have more peer groups than the drives of the other hosts allow
    const peers = Math.min(pgs, (n_hosts-1)*n_drives/other_replicas);
    // With degraded replacement the rebalance is not sped up by the peers
    const rebalance_share = degraded_replacement ? 1 : peers;
    // capacity/speed is in seconds; /86400 -> days; /365 -> years
    const rebalance_years = capacity/rebalance_share/speed/86400/365;
    // All other drives of one replica group fail during the rebalance
    const p_group = (afr_drive*rebalance_years)**other_replicas;
    // At least one of the peer groups fails
    const p_any_group = 1-(1-p_group)**peers;
    return 1 - (1 - afr_drive * p_any_group) ** (n_hosts*n_drives);
}
// Erasure-coded variant of cluster_afr_pgs().
// <ec> is a pair [ number of data chunks, number of parity chunks ]; the other
// fields have the same meaning as in cluster_afr_pgs().
// Instead of "all other replicas fail", the per-group term is
// failure_rate_fullmesh() over the other ec_total-1 drives of the group
// with <ec_parity> as the failure count.
function cluster_afr_pgs_ec({ n_hosts, n_drives, afr_drive, capacity, speed, ec: [ ec_data, ec_parity ], pgs = 1, degraded_replacement })
{
    // Coerce explicitly: every other operator used here converts numeric strings
    // to numbers, but "+" would concatenate them ("2"+"1" -> "21" instead of 3),
    // which happens when the values come straight from text inputs.
    const parity = Number(ec_parity);
    const ec_total = Number(ec_data)+parity;
    // An OSD cannot have more peer groups than the drives of the other hosts allow
    pgs = Math.min(pgs, (n_hosts-1)*n_drives/(ec_total-1));
    // Rebalance time in years (capacity/speed is in seconds)
    const l = capacity/(degraded_replacement ? 1 : pgs)/speed/86400/365;
    return 1 - (1 - afr_drive * (1-(1-failure_rate_fullmesh(ec_total-1, afr_drive*l, parity))**pgs)) ** (n_hosts*n_drives);
}
// Replicated cluster with limited peering, additionally taking whole-server
// failures into account (<afr_host> is the annualized failure rate of a host).
// The result combines two independent "nothing is lost" terms: one over the
// hosts and one over all individual drives.
function cluster_afr_pgs_hosts({ n_hosts, n_drives, afr_drive, afr_host, capacity, speed, replicas, pgs = 1, degraded_replacement })
{
    const other_replicas = replicas-1;
    // Computed from the requested (not yet clamped) pgs value
    const peer_hosts = Math.min(pgs, (n_hosts-1)/other_replicas);
    const peer_limit = (n_hosts-1)*n_drives/other_replicas;
    // Peer groups per drive and per host, both limited by the drives of the other hosts
    const drive_peers = Math.min(pgs, peer_limit);
    const host_peers = Math.min(drive_peers*n_drives, peer_limit);
    // Rebalance times in years: after a drive failure and after a host failure (n_drives drives at once)
    const drive_window = capacity/(degraded_replacement ? 1 : drive_peers)/speed/86400/365;
    const host_window = n_drives*capacity/drive_peers/speed/86400/365;
    // Yearly failure rate of a peer: its own AFR plus a share of its host's AFR
    const peer_rate = afr_drive+afr_host*drive_peers/peer_hosts;
    const p_after_host = peer_rate*host_window;
    const p_after_drive = peer_rate*drive_window;
    const hosts_ok = (1 - afr_host * (1-(1-p_after_host**other_replicas)**host_peers)) ** n_hosts;
    const drives_ok = (1 - afr_drive * (1-(1-p_after_drive**other_replicas)**drive_peers)) ** (n_hosts*n_drives);
    return 1 - hosts_ok * drives_ok;
}
// Erasure-coded cluster with limited peering, additionally taking whole-server
// failures into account (<afr_host> is the annualized failure rate of a host).
// <ec> is a pair [ number of data chunks, number of parity chunks ].
// The result combines two "nothing is lost" terms: one over the hosts and one
// over all individual drives; the per-group failure probability comes from
// failure_rate_fullmesh() over the other ec_total-1 drives of the group.
function cluster_afr_pgs_ec_hosts({ n_hosts, n_drives, afr_drive, afr_host, capacity, speed, ec: [ ec_data, ec_parity ], pgs = 1, degraded_replacement })
{
    // Coerce explicitly: every other operator used here converts numeric strings
    // to numbers, but "+" would concatenate them ("2"+"1" -> "21" instead of 3),
    // which happens when the values come straight from text inputs.
    const parity = Number(ec_parity);
    const ec_total = Number(ec_data)+parity;
    // Computed from the requested (not yet clamped) pgs value
    const otherhosts = Math.min(pgs, (n_hosts-1)/(ec_total-1));
    // Peer groups per drive and per host, both limited by the drives of the other hosts
    pgs = Math.min(pgs, (n_hosts-1)*n_drives/(ec_total-1));
    const pgh = Math.min(pgs*n_drives, (n_hosts-1)*n_drives/(ec_total-1));
    // Rebalance times in years: after a drive failure (ld) and after a host failure (lh)
    const ld = capacity/(degraded_replacement ? 1 : pgs)/speed/86400/365;
    const lh = n_drives*capacity/pgs/speed/86400/365;
    // Failure probability of a peer during the host / drive rebalance window
    const p1 = ((afr_drive+afr_host*pgs/otherhosts)*lh);
    const p2 = ((afr_drive+afr_host*pgs/otherhosts)*ld);
    return 1 - ((1 - afr_host * (1-(1-failure_rate_fullmesh(ec_total-1, p1, parity))**pgh)) ** n_hosts) *
        ((1 - afr_drive * (1-(1-failure_rate_fullmesh(ec_total-1, p2, parity))**pgs)) ** (n_hosts*n_drives));
}
// Entry point: picks one of the four cluster_afr_pgs* models based on the config.
//   config.ec truthy       -> erasure-coded model, otherwise replicated
//   config.afr_host truthy -> the variant that also accounts for host failures
// The config object is passed to the selected function unchanged.
function cluster_afr(config)
{
    if (config.ec)
    {
        return config.afr_host
            ? cluster_afr_pgs_ec_hosts(config)
            : cluster_afr_pgs_ec(config);
    }
    return config.afr_host
        ? cluster_afr_pgs_hosts(config)
        : cluster_afr_pgs(config);
}
// Print a one-line human readable description of <config> followed by the
// cluster AFR computed by cluster_afr(), in percent with 7 decimals.
// config.capacity is divided by 1000 and labelled TB, config.speed is
// multiplied by 1000 and labelled MB/s, AFR values are printed in percent.
function print_cluster_afr(config)
{
    // Fixed-point formatting via the built-in toFixed(): no sprintf() is
    // imported or defined in this file, so calling it would throw a ReferenceError.
    const fixed = (value, digits) => Number(value).toFixed(digits);
    console.log(
        `${config.n_hosts} nodes with ${config.n_drives} ${fixed(config.capacity/1000, 1)}TB drives`+
        `, capable to backfill at ${fixed(config.speed*1000, 1)} MB/s, drive AFR ${fixed(config.afr_drive*100, 1)}%`+
        (config.afr_host ? `, host AFR ${fixed(config.afr_host*100, 1)}%` : '')+
        (config.ec ? `, EC ${config.ec[0]}+${config.ec[1]}` : `, ${config.replicas} replicas`)+
        `, ${config.pgs||1} PG per OSD`+
        (config.degraded_replacement ? `\n...and you don't let the rebalance finish before replacing drives` : '')
    );
    console.log('-> '+fixed(100*cluster_afr(config), 7)+'%\n');
}
/******** UTILITY ********/
// Number of combinations C(n, k): ways to choose <k> items out of <n>.
// Built up as a running product of (n-i)/(i+1) for i = 0..k-1, so the
// intermediate values stay small (no factorials); returns 1 for k <= 0.
function c_n_k(n, k)
{
    let combos = 1;
    let step = 0;
    while (step < k)
    {
        combos *= (n-step) / (step+1);
        step++;
    }
    return combos;
}

0
dist/.gitkeep vendored Normal file
View File

25
index.html Normal file
View File

@ -0,0 +1,25 @@
<!DOCTYPE html>
<!-- Static shell of the calculator page: styles only, the UI itself is rendered by dist/main.js -->
<html lang="ru">
<head>
    <title>Калькулятор вероятности отказа кластера Ceph/Vitastor</title>
    <meta charset="utf-8" />
    <style>
    * { box-sizing: border-box; }
    body { margin: 0; font-size: 15px; font-family: Arial, Helvetica, sans-serif; }
    input { font-size: inherit; font-family: inherit; vertical-align: middle; }
    table { border-collapse: collapse; margin-left: 110px; }
    td { padding: 5px; }
    th { text-align: left; font-weight: normal; white-space: nowrap; width: 1%; }
    input[type="text"] { border: 1px solid #aaa; padding: 4px; border-radius: 3px; }
    /* Two-part pill-shaped toggle: .l / .r are the left and right halves, .sel marks the active one */
    .switch { float: left; border: 1px solid #aaa; color: #666; height: 30px; padding: 5px 10px; transition: all 200ms ease-in-out; cursor: pointer; position: relative; overflow: hidden; }
    .switch input { visibility: hidden; position: absolute; top: 0; left: 0; right: 0; bottom: 0; }
    .switch.l { border-right-width: 0; border-radius: 20px 0 0 20px; padding-left: 15px; }
    .switch.r { border-left-width: 0; border-radius: 0 20px 20px 0; padding-right: 15px; }
    .switch.sel { border-color: #08f; background: #08f; color: white; }
    .switch:hover { border-color: #4af; background: #4af; color: white; }
    </style>
</head>
<body>
    <!-- The script is the last child of <body>: a <script> placed after </body> is
         invalid markup that browsers only accept through parser error recovery -->
    <script type="text/javascript" src="dist/main.js"></script>
</body>
</html>

151
main.js Normal file
View File

@ -0,0 +1,151 @@
import * as preact from 'preact';
/** @jsx preact.h */
import { cluster_afr } from './afr.js';
class Calc extends preact.Component
{
state = {
hosts: 10,
drives: 10,
afr_drive: 3,
afr_host: 5,
capacity: 8,
speed: 20,
ec: false,
replicas: 2,
ec_data: 2,
ec_parity: 1,
eager: false,
result: 0,
}
calc(st)
{
st = { ...this.state, ...st };
st.result = 100*cluster_afr({
n_hosts: st.hosts,
n_drives: st.drives,
afr_drive: st.afr_drive/100,
afr_host: st.afr_host/100,
capacity: st.capacity*1000,
speed: st.speed/1000,
ec: st.ec ? [ st.ec_data, st.ec_parity ] : null,
replicas: st.replicas,
pgs: 100,
degraded_replacement: st.eager,
});
this.setState(st);
}
setter(field)
{
if (!this.setter[field])
{
this.setter[field] = (event) =>
{
this.calc({ [field]: event.target.value });
};
}
return this.setter[field];
}
setRepl = () =>
{
this.calc({ ec: false });
}
setEC = () =>
{
this.calc({ ec: true });
}
setEager = (event) =>
{
this.calc({ eager: event.target.checked });
}
componentDidMount()
{
this.calc({});
}
render(props, state)
{
return (<div style="width: 750px; margin: 20px; padding: 20px; box-shadow: 0 19px 60px rgba(0, 0, 0, 0.3), 0 15px 20px rgba(0, 0, 0, 0.22);">
<h2 style="text-align: center; font-size: 150%; margin: 10px 0 20px 0; font-weight: bold">
Калькулятор вероятности отказа кластера Ceph/Vitastor
</h2>
<p>
Вероятность полного отказа кластера зависит от числа серверов и дисков
(чем их больше, тем вероятность больше), от схемы избыточности, скорости ребаланса (восстановления),
и, конечно, непосредственно вероятности выхода из строя самих дисков и серверов.
</p>
<p>
Расчёт ведётся в простом предположении, что отказы распределены равномерно во времени.
</p>
<table>
<tr>
<th>Число серверов</th>
<td><input type="text" value={state.hosts} onchange={this.setter('hosts')} /></td>
</tr>
<tr>
<th>Число дисков в сервере</th>
<td><input type="text" value={state.drives} onchange={this.setter('drives')} /></td>
</tr>
<tr>
<th>Ёмкость дисков</th>
<td><input type="text" value={state.capacity} onchange={this.setter('capacity')} /> ТБ</td>
</tr>
<tr>
<th>Схема избыточности</th>
<td>
<label class={"switch l"+(state.ec ? "" : " sel")}>
<input type="radio" name="scheme" checked={!state.ec} onclick={this.setRepl} /> Репликация
</label>
<label class={"switch r"+(state.ec ? " sel" : "")}>
<input type="radio" name="scheme" checked={state.ec} onclick={this.setEC} /> EC (коды коррекции ошибок)
</label>
</td>
</tr>
{state.ec ? null : <tr>
<th>Число реплик</th>
<td><input type="text" value={state.replicas} onchange={this.setter('replicas')} /></td>
</tr>}
{state.ec ? <tr>
<th>Число дисков данных</th>
<td><input type="text" value={state.ec_data} onchange={this.setter('ec_data')} /></td>
</tr> : null}
{state.ec ? <tr>
<th>Число дисков чётности</th>
<td><input type="text" value={state.ec_parity} onchange={this.setter('ec_parity')} /></td>
</tr> : null}
<tr>
<th>Оценочная скорость<br />восстановления на 1 OSD</th>
<td><input type="text" value={state.speed} onchange={this.setter('speed')} /> МБ/с</td>
</tr>
<tr>
<th><abbr title="Annualized Failure Rate, вероятность отказа в течение года в %">AFR</abbr> диска</th>
<td><input type="text" value={state.afr_drive} onchange={this.setter('afr_drive')} /> %</td>
</tr>
<tr>
<th>AFR сервера</th>
<td><input type="text" value={state.afr_host} onchange={this.setter('afr_host')} /> %</td>
</tr>
</table>
<p>
<label><input type="checkbox" checked={state.eager} onchange={this.setEager} />
Я нетерпеливый и заменяю отказавший диск сразу, не давая данным уехать на остальные диски
(либо данным уезжать некуда, например, сервера всего 3 при 3 репликах)
</label>
</p>
<div style="text-align: center; font-size: 150%; margin: 20px 0; font-weight: bold">
Вероятность потери данных в течение года:
</div>
<div style="text-align: center; font-size: 200%; margin: 10px 0; font-weight: bold">
{Math.round(state.result*10000)/10000} %
</div>
</div>);
}
}
// Mount the calculator component into the document body
preact.render(<Calc />, document.body);

26
package.json Normal file
View File

@ -0,0 +1,26 @@
{
"name": "ceph-afr-calc",
"version": "1.0.0",
"description": "Ceph/Vitastor cluster failure calculator",
"main": "main.js",
"scripts": {
"build": "webpack",
"watch-dev": "NODE_ENV=development webpack --mode=development -w"
},
"author": "Vitaliy Filippov",
"license": "AGPL-3.0",
"devDependencies": {
"webpack-cli": "^4.3.1"
},
"dependencies": {
"babel-cli": "^6.26.0",
"babel-core": "^6.26.3",
"babel-loader": "^7.1.4",
"babel-preset-env": "^1.7.0",
"babel-preset-react": "^6.24.1",
"babel-preset-stage-1": "^6.24.1",
"preact": "^10.5.10",
"webpack": "^4.26.1",
"webpack-cli": "^3.0.8"
}
}

28
webpack.config.js Normal file
View File

@ -0,0 +1,28 @@
const webpack = require('webpack');
const path = require('path');
module.exports = {
entry: { 'main': [ './main.js' ] },
context: __dirname,
output: {
path: __dirname,
filename: './dist/[name].js'
},
devtool: 'cheap-module-source-map',
module: {
rules: [
{
test: /.jsx?$/,
loader: 'babel-loader',
exclude: /node_modules/
}
]
},
plugins: [
new webpack.DefinePlugin({
"process.env": {
NODE_ENV: JSON.stringify(process.env.NODE_ENV || "production")
}
})
]
};