From 07fbb7e98cb3f953fbc869989fba8fe48ff4eb40 Mon Sep 17 00:00:00 2001
From: Vitaliy Filippov <vitalif@yourcmc.ru>
Date: Fri, 15 Jan 2021 15:45:01 +0300
Subject: [PATCH] Wrap cluster AFR calculator in a webpage

---
 .babelrc          |   7 +
 afr.js            | 147 ++++++++++++++++++++++++++++++++++++++++++++++
 dist/.gitkeep     |   0
 index.html        |  25 +++++++
 main.js           | 151 +++++++++++++++++++++++++++++++++++++++++++++++
 package.json      |  26 ++++++++
 webpack.config.js |  28 ++++++++
 7 files changed, 384 insertions(+)
 create mode 100644 .babelrc
 create mode 100644 afr.js
 create mode 100644 dist/.gitkeep
 create mode 100644 index.html
 create mode 100644 main.js
 create mode 100644 package.json
 create mode 100644 webpack.config.js

diff --git a/.babelrc b/.babelrc
new file mode 100644
index 0000000..43af0e1
--- /dev/null
+++ b/.babelrc
@@ -0,0 +1,7 @@
+{
+  "presets": [ [ "env" ], "stage-1" ],
+  "retainLines": true,
+  "plugins": [
+    [ "transform-react-jsx", { "pragma": "preact.h" } ]
+  ]
+}
diff --git a/afr.js b/afr.js
new file mode 100644
index 0000000..2153f09
--- /dev/null
+++ b/afr.js
@@ -0,0 +1,147 @@
+// Functions to calculate the Annualized Failure Rate of your cluster
+// if you know the AFR of your drives, the number of drives, the expected
+// rebalance time and the replication factor
+// License: VNPL-1.0 (see https://yourcmc.ru/git/vitalif/vitastor/src/branch/master/README.md for details) or AGPL-3.0
+// Author: Vitaliy Filippov, 2020+
+
+const { sprintf } = require('sprintf-js'); // used by print_cluster_afr()
+
+module.exports = {
+    cluster_afr_fullmesh,
+    failure_rate_fullmesh,
+    cluster_afr,
+    print_cluster_afr,
+    c_n_k,
+};
+
+/******** "FULL MESH": ASSUME EACH OSD COMMUNICATES WITH ALL OTHER OSDS ********/
+
+// Estimate AFR of the cluster
+// n - number of drives
+// afr - annualized failure rate of a single drive
+// l - expected rebalance time in days after a single drive failure
+// k - replication factor / number of drives that must fail at the same time for the cluster to fail
+function cluster_afr_fullmesh(n, afr, l, k)
+{
+    return 1 - (1 - afr * failure_rate_fullmesh(n-(k-1), afr*l/365, k-1)) ** (n-(k-1));
+}
+
+// Probability of at least <f> failures in a cluster with <n> drives with AFR=<a>
+function failure_rate_fullmesh(n, a, f)
+{
+    if (f <= 0)
+    {
+        return (1-a)**n;
+    }
+    let p = 1;
+    for (let i = 0; i < f; i++)
+    {
+        p -= c_n_k(n, i) * (1-a)**(n-i) * a**i;
+    }
+    return p;
+}
+
+/******** PGS: EACH OSD ONLY COMMUNICATES WITH <pgs> OTHER OSDs ********/
+
+// <n> hosts of <m> drives of <capacity> GB, each able to backfill at <speed> GB/s,
+// <k> replicas, <pgs> unique peer PGs per OSD
+//
+// For each of n*m drives: P(drive fails in a year) * P(any of its peers fail in the next <l*365> days).
+// More peers per OSD increase rebalance speed (more drives work together to resilver) if you
+// let them finish the rebalance BEFORE replacing the failed drive.
+// At the same time, more peers per OSD increase the probability that any of them fails!
+//
+// The probability of all drives in a replica group except one failing is (AFR^(k-1)).
+// So with <x> PGs it becomes ~ (x * (AFR*L/365)^(k-1)). An interesting but reasonable consequence
+// is that, with k=2, the total failure rate doesn't depend on the number of peers per OSD,
+// because it gets increased linearly by the number of peers that can fail
+// and decreased linearly by the reduced rebalance time.
+function cluster_afr_pgs({ n_hosts, n_drives, afr_drive, capacity, speed, replicas, pgs = 1, degraded_replacement })
+{
+    pgs = Math.min(pgs, (n_hosts-1)*n_drives/(replicas-1));
+    const l = capacity/(degraded_replacement ? 1 : pgs)/speed/86400/365;
+    return 1 - (1 - afr_drive * (1-(1-(afr_drive*l)**(replicas-1))**pgs)) ** (n_hosts*n_drives);
+}
+
+function cluster_afr_pgs_ec({ n_hosts, n_drives, afr_drive, capacity, speed, ec: [ ec_data, ec_parity ], pgs = 1, degraded_replacement })
+{
+    const ec_total = ec_data+ec_parity;
+    pgs = Math.min(pgs, (n_hosts-1)*n_drives/(ec_total-1));
+    const l = capacity/(degraded_replacement ? 1 : pgs)/speed/86400/365;
+    return 1 - (1 - afr_drive * (1-(1-failure_rate_fullmesh(ec_total-1, afr_drive*l, ec_parity))**pgs)) ** (n_hosts*n_drives);
+}
+
+// Same as above, but also take server failures into account
+function cluster_afr_pgs_hosts({ n_hosts, n_drives, afr_drive, afr_host, capacity, speed, replicas, pgs = 1, degraded_replacement })
+{
+    let otherhosts = Math.min(pgs, (n_hosts-1)/(replicas-1));
+    pgs = Math.min(pgs, (n_hosts-1)*n_drives/(replicas-1));
+    let pgh = Math.min(pgs*n_drives, (n_hosts-1)*n_drives/(replicas-1));
+    const ld = capacity/(degraded_replacement ? 1 : pgs)/speed/86400/365;
+    const lh = n_drives*capacity/pgs/speed/86400/365;
+    const p1 = ((afr_drive+afr_host*pgs/otherhosts)*lh);
+    const p2 = ((afr_drive+afr_host*pgs/otherhosts)*ld);
+    return 1 - ((1 - afr_host * (1-(1-p1**(replicas-1))**pgh)) ** n_hosts) *
+        ((1 - afr_drive * (1-(1-p2**(replicas-1))**pgs)) ** (n_hosts*n_drives));
+}
+
+function cluster_afr_pgs_ec_hosts({ n_hosts, n_drives, afr_drive, afr_host, capacity, speed, ec: [ ec_data, ec_parity ], pgs = 1, degraded_replacement })
+{
+    const ec_total = ec_data+ec_parity;
+    const otherhosts = Math.min(pgs, (n_hosts-1)/(ec_total-1));
+    pgs = Math.min(pgs, (n_hosts-1)*n_drives/(ec_total-1));
+    const pgh = Math.min(pgs*n_drives, (n_hosts-1)*n_drives/(ec_total-1));
+    const ld = capacity/(degraded_replacement ? 1 : pgs)/speed/86400/365;
+    const lh = n_drives*capacity/pgs/speed/86400/365;
+    const p1 = ((afr_drive+afr_host*pgs/otherhosts)*lh);
+    const p2 = ((afr_drive+afr_host*pgs/otherhosts)*ld);
+    return 1 - ((1 - afr_host * (1-(1-failure_rate_fullmesh(ec_total-1, p1, ec_parity))**pgh)) ** n_hosts) *
+        ((1 - afr_drive * (1-(1-failure_rate_fullmesh(ec_total-1, p2, ec_parity))**pgs)) ** (n_hosts*n_drives));
+}
+
+// Wrapper for the 4 functions above
+function cluster_afr(config)
+{
+    if (config.ec && config.afr_host)
+    {
+        return cluster_afr_pgs_ec_hosts(config);
+    }
+    else if (config.ec)
+    {
+        return cluster_afr_pgs_ec(config);
+    }
+    else if (config.afr_host)
+    {
+        return cluster_afr_pgs_hosts(config);
+    }
+    else
+    {
+        return cluster_afr_pgs(config);
+    }
+}
+
+function print_cluster_afr(config)
+{
+    console.log(
+        `${config.n_hosts} nodes with ${config.n_drives} ${sprintf("%.1f", config.capacity/1000)}TB drives`+
+        `, capable of backfilling at ${sprintf("%.1f", config.speed*1000)} MB/s, drive AFR ${sprintf("%.1f", config.afr_drive*100)}%`+
+        (config.afr_host ? `, host AFR ${sprintf("%.1f", config.afr_host*100)}%` : '')+
+        (config.ec ? `, EC ${config.ec[0]}+${config.ec[1]}` : `, ${config.replicas} replicas`)+
+        `, ${config.pgs||1} PGs per OSD`+
+        (config.degraded_replacement ? `\n...and you don't let the rebalance finish before replacing drives` : '')
+    );
+    console.log('-> '+sprintf("%.7f%%", 100*cluster_afr(config))+'\n');
+}
+
+/******** UTILITY ********/
+
+// Combination count C(n, k)
+function c_n_k(n, k)
+{
+    let r = 1;
+    for (let i = 0; i < k; i++)
+    {
+        r *= (n-i) / (i+1);
+    }
+    return r;
+}
diff --git a/dist/.gitkeep b/dist/.gitkeep
new file mode 100644
index 0000000..e69de29
diff --git a/index.html b/index.html
new file mode 100644
index 0000000..d95969e
--- /dev/null
+++ b/index.html
@@ -0,0 +1,25 @@
+<!DOCTYPE html>
+<html>
+<head>
+<meta charset="utf-8" />
+<meta name="viewport" content="width=device-width, initial-scale=1" />
+<title>Ceph/Vitastor Cluster Failure Probability Calculator</title>
+<style>
+body {
+    font-family: sans-serif;
+    font-size: 16px;
+    margin: 16px;
+}
+table {
+    border-collapse: collapse;
+}
+td {
+    padding: 4px 8px;
+}
+</style>
+</head>
+<body>
+<!-- The calculator is rendered into <body> by dist/main.js -->
+<script src="dist/main.js"></script>
+</body>
+</html>
diff --git a/main.js b/main.js
new file mode 100644
index 0000000..c3b27ce
--- /dev/null
+++ b/main.js
@@ -0,0 +1,151 @@
+import * as preact from 'preact';
+/** @jsx preact.h */
+import { cluster_afr } from './afr.js';
+
+class Calc extends preact.Component
+{
+    state = {
+        hosts: 10,
+        drives: 10,
+        afr_drive: 3,
+        afr_host: 5,
+        capacity: 8,
+        speed: 20,
+        ec: false,
+        replicas: 2,
+        ec_data: 2,
+        ec_parity: 1,
+        eager: false,
+        result: 0,
+    }
+
+    calc(st)
+    {
+        st = { ...this.state, ...st };
+        st.result = 100*cluster_afr({
+            n_hosts: st.hosts,
+            n_drives: st.drives,
+            afr_drive: st.afr_drive/100,
+            afr_host: st.afr_host/100,
+            capacity: st.capacity*1000,
+            speed: st.speed/1000,
+            ec: st.ec ? [ st.ec_data, st.ec_parity ] : null,
+            replicas: st.replicas,
+            pgs: 100,
+            degraded_replacement: st.eager,
+        });
+        this.setState(st);
+    }
+
+    setter(field)
+    {
+        if (!this.setter[field])
+        {
+            this.setter[field] = (event) =>
+            {
+                this.calc({ [field]: event.target.value });
+            };
+        }
+        return this.setter[field];
+    }
+
+    setRepl = () =>
+    {
+        this.calc({ ec: false });
+    }
+
+    setEC = () =>
+    {
+        this.calc({ ec: true });
+    }
+
+    setEager = (event) =>
+    {
+        this.calc({ eager: event.target.checked });
+    }
+
+    componentDidMount()
+    {
+        this.calc({});
+    }
+
+    render(props, state)
+    {
+        return (<div>
+            <h1>Ceph/Vitastor Cluster Failure Probability Calculator</h1>
+            <p>
+                The probability of a full cluster failure depends on the number of servers and drives
+                (the more of them you have, the higher it is), on the redundancy scheme, on the rebalance
+                (recovery) speed and, of course, on the failure probability of the drives and servers themselves.
+            </p>
+            <p>
+                The estimate is made under the simple assumption that failures are uniformly distributed in time.
+            </p>
+            <table>
+            <tr>
+                <td>Number of servers</td>
+                <td><input type="number" value={state.hosts} onInput={this.setter('hosts')} /></td>
+                <td></td>
+            </tr>
+            <tr>
+                <td>Drives per server</td>
+                <td><input type="number" value={state.drives} onInput={this.setter('drives')} /></td>
+                <td></td>
+            </tr>
+            <tr>
+                <td>Drive capacity</td>
+                <td><input type="number" value={state.capacity} onInput={this.setter('capacity')} /></td>
+                <td>TB</td>
+            </tr>
+            <tr>
+                <td>Redundancy scheme</td>
+                <td colspan="2">
+                    <label><input type="radio" checked={!state.ec} onClick={this.setRepl} /> Replication</label>
+                    <label><input type="radio" checked={state.ec} onClick={this.setEC} /> Erasure coding</label>
+                </td>
+            </tr>
+            {state.ec ? null : <tr>
+                <td>Number of replicas</td>
+                <td><input type="number" value={state.replicas} onInput={this.setter('replicas')} /></td>
+                <td></td>
+            </tr>}
+            {state.ec ? <tr>
+                <td>Data drives</td>
+                <td><input type="number" value={state.ec_data} onInput={this.setter('ec_data')} /></td>
+                <td></td>
+            </tr> : null}
+            {state.ec ? <tr>
+                <td>Parity drives</td>
+                <td><input type="number" value={state.ec_parity} onInput={this.setter('ec_parity')} /></td>
+                <td></td>
+            </tr> : null}
+            <tr>
+                <td>Estimated rebalance speed<br />per OSD</td>
+                <td><input type="number" value={state.speed} onInput={this.setter('speed')} /></td>
+                <td>MB/s</td>
+            </tr>
+            <tr>
+                <td>Drive AFR</td>
+                <td><input type="number" value={state.afr_drive} onInput={this.setter('afr_drive')} /></td>
+                <td>%</td>
+            </tr>
+            <tr>
+                <td>Host AFR</td>
+                <td><input type="number" value={state.afr_host} onInput={this.setter('afr_host')} /></td>
+                <td>%</td>
+            </tr>
+            </table>
+            <p>
+                <label><input type="checkbox" checked={state.eager} onChange={this.setEager} /> Replace failed drives immediately (don't let the rebalance finish first)</label>
+            </p>
+            <div>
+                Probability of data loss within a year:
+            </div>
+            <div>
+                {Math.round(state.result*10000)/10000} %
+            </div>
+        </div>);
+    }
+}
+
+preact.render(<Calc />, document.body);
diff --git a/package.json b/package.json
new file mode 100644
index 0000000..ab11380
--- /dev/null
+++ b/package.json
@@ -0,0 +1,26 @@
+{
+  "name": "ceph-afr-calc",
+  "version": "1.0.0",
+  "description": "Ceph/Vitastor cluster failure calculator",
+  "main": "main.js",
+  "scripts": {
+    "build": "webpack",
+    "watch-dev": "NODE_ENV=development webpack --mode=development -w"
+  },
+  "author": "Vitaliy Filippov",
+  "license": "AGPL-3.0",
+  "devDependencies": {
+    "webpack-cli": "^4.3.1"
+  },
+  "dependencies": {
+    "babel-cli": "^6.26.0",
+    "babel-core": "^6.26.3",
+    "babel-loader": "^7.1.4",
+    "babel-preset-env": "^1.7.0",
+    "babel-preset-react": "^6.24.1",
+    "babel-preset-stage-1": "^6.24.1",
+    "preact": "^10.5.10",
+    "sprintf-js": "^1.1.2",
+    "webpack": "^4.26.1"
+  }
+}
diff --git a/webpack.config.js b/webpack.config.js
new file mode 100644
index 0000000..c0456cc
--- /dev/null
+++ b/webpack.config.js
@@ -0,0 +1,28 @@
+const webpack = require('webpack');
+const path = require('path');
+
+module.exports = {
+    entry: { 'main': [ './main.js' ] },
+    context: __dirname,
+    output: {
+        path: __dirname,
+        filename: './dist/[name].js'
+    },
+    devtool: 'cheap-module-source-map',
+    module: {
+        rules: [
+            {
+                test: /\.jsx?$/,
+                loader: 'babel-loader',
+                exclude: /node_modules/
+            }
+        ]
+    },
+    plugins: [
+        new webpack.DefinePlugin({
+            "process.env": {
+                NODE_ENV: JSON.stringify(process.env.NODE_ENV || "production")
+            }
+        })
+    ]
+};
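
Note: afr.js is a plain CommonJS module, so the calculator can also be driven from Node.js
without the webpage. A minimal usage sketch follows; the parameter values are illustrative
assumptions that mirror the webpage defaults, not something defined by this patch:

// sketch: estimate the annual failure probability of a 10x10 cluster (Node.js)
const { cluster_afr, print_cluster_afr } = require('./afr.js');

const config = {
    n_hosts: 10,                 // servers
    n_drives: 10,                // drives per server
    afr_drive: 0.03,             // 3% drive AFR
    afr_host: 0.05,              // 5% host AFR
    capacity: 8000,              // drive capacity in GB (8 TB)
    speed: 0.02,                 // backfill speed per OSD in GB/s (20 MB/s)
    replicas: 2,
    pgs: 100,                    // unique peer PGs per OSD
    degraded_replacement: false, // let the rebalance finish before replacing drives
};

console.log(cluster_afr(config)); // fraction per year; multiply by 100 for %
print_cluster_afr(config);        // prints a human-readable summary

print_cluster_afr() is the only consumer of the sprintf-js dependency; cluster_afr() itself
just returns a number, which is all the webpage uses.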