From 9f58f01425357dcd47e0e23dfee2b7e6a9111e22 Mon Sep 17 00:00:00 2001
From: Vitaliy Filippov
Date: Sun, 17 Jan 2021 18:58:50 +0300
Subject: [PATCH] Mirror afr.js from /vitalif/ceph-afr-calc

---
 mon/afr.js      | 116 ++++++++++----------------------------------------
 mon/afr_test.js |  28 ++++++++++++
 2 files changed, 51 insertions(+), 93 deletions(-)
 create mode 100644 mon/afr_test.js

diff --git a/mon/afr.js b/mon/afr.js
index 8bf84f6c..e3ffa5e8 100644
--- a/mon/afr.js
+++ b/mon/afr.js
@@ -1,31 +1,16 @@
 // Functions to calculate Annualized Failure Rate of your cluster
 // if you know AFR of your drives, number of drives, expected rebalance time
 // and replication factor
-// License: VNPL-1.0 (see README.md for details)
-
-const { sprintf } = require('sprintf-js');
+// License: VNPL-1.0 (see https://yourcmc.ru/git/vitalif/vitastor/src/branch/master/README.md for details) or AGPL-3.0
+// Author: Vitaliy Filippov, 2020+
 
 module.exports = {
     cluster_afr_fullmesh,
     failure_rate_fullmesh,
     cluster_afr,
-    print_cluster_afr,
     c_n_k,
 };
 
-print_cluster_afr({ n_hosts: 4, n_drives: 6, afr_drive: 0.03, afr_host: 0.05, capacity: 4000, speed: 0.1, replicas: 2 });
-print_cluster_afr({ n_hosts: 4, n_drives: 3, afr_drive: 0.03, capacity: 4000, speed: 0.1, replicas: 2 });
-print_cluster_afr({ n_hosts: 4, n_drives: 3, afr_drive: 0.03, afr_host: 0.05, capacity: 4000, speed: 0.1, replicas: 2 });
-print_cluster_afr({ n_hosts: 4, n_drives: 3, afr_drive: 0.03, capacity: 4000, speed: 0.1, ec: [ 2, 1 ] });
-print_cluster_afr({ n_hosts: 4, n_drives: 3, afr_drive: 0.03, afr_host: 0.05, capacity: 4000, speed: 0.1, ec: [ 2, 1 ] });
-print_cluster_afr({ n_hosts: 10, n_drives: 10, afr_drive: 0.1, capacity: 8000, speed: 0.02, replicas: 2 });
-print_cluster_afr({ n_hosts: 10, n_drives: 10, afr_drive: 0.1, afr_host: 0.05, capacity: 8000, speed: 0.02, replicas: 2 });
-print_cluster_afr({ n_hosts: 10, n_drives: 10, afr_drive: 0.1, capacity: 8000, speed: 0.02, replicas: 3 });
-print_cluster_afr({ n_hosts: 10, n_drives: 10, afr_drive: 0.1, afr_host: 0.05, capacity: 8000, speed: 0.02, replicas: 3 });
-print_cluster_afr({ n_hosts: 10, n_drives: 10, afr_drive: 0.1, capacity: 8000, speed: 0.02, replicas: 3, pgs: 100 });
-print_cluster_afr({ n_hosts: 10, n_drives: 10, afr_drive: 0.1, afr_host: 0.05, capacity: 8000, speed: 0.02, replicas: 3, pgs: 100 });
-print_cluster_afr({ n_hosts: 10, n_drives: 10, afr_drive: 0.1, afr_host: 0.05, capacity: 8000, speed: 0.02, replicas: 3, pgs: 100, degraded_replacement: 1 });
-
 /******** "FULL MESH": ASSUME EACH OSD COMMUNICATES WITH ALL OTHER OSDS ********/
 
 // Estimate AFR of the cluster
@@ -56,93 +41,38 @@ function failure_rate_fullmesh(n, a, f)
 /******** PGS: EACH OSD ONLY COMMUNICATES WITH <pgs> OTHER OSDs ********/
 
 // <n> hosts of <m> drives of <capacity> GB, each able to backfill at <speed> GB/s,
-// <k> replicas, <pgs> unique peer PGs per OSD
+// <k> replicas, <pgs> unique peer PGs per OSD (~50 for 100 PG-per-OSD in a big cluster)
 //
 // For each of n*m drives: P(drive fails in a year) * P(any of its peers fail in next <l*365> days).
 // More peers per OSD increase rebalance speed (more drives work together to resilver) if you
-// let them finish rebalance BEFORE replacing the failed drive.
+// let them finish rebalance BEFORE replacing the failed drive (degraded_replacement=false).
 // At the same time, more peers per OSD increase probability of any of them to fail!
+// osd_rm=true means that failed OSDs' data is rebalanced over all other hosts,
+// not over the same host as it's in Ceph by default (dead OSDs are marked 'out').
 //
 // Probability of all except one drives in a replica group to fail is (AFR^(k-1)).
 // So with <x> PGs it becomes ~ (x * (AFR*L/365)^(k-1)). Interesting but reasonable consequence
 // is that, with k=2, total failure rate doesn't depend on number of peers per OSD,
 // because it gets increased linearly by increased number of peers to fail
 // and decreased linearly by reduced rebalance time.
-function cluster_afr_pgs({ n_hosts, n_drives, afr_drive, capacity, speed, replicas, pgs = 1, degraded_replacement })
+function cluster_afr({ n_hosts, n_drives, afr_drive, afr_host, capacity, speed, ec, ec_data, ec_parity, replicas, pgs = 1, osd_rm, degraded_replacement, down_out_interval = 600 })
 {
-    pgs = Math.min(pgs, (n_hosts-1)*n_drives/(replicas-1));
-    const l = capacity/(degraded_replacement ? 1 : pgs)/speed/86400/365;
-    return 1 - (1 - afr_drive * (1-(1-(afr_drive*l)**(replicas-1))**pgs)) ** (n_hosts*n_drives);
-}
-
-function cluster_afr_pgs_ec({ n_hosts, n_drives, afr_drive, capacity, speed, ec: [ ec_data, ec_parity ], pgs = 1, degraded_replacement })
-{
-    const ec_total = ec_data+ec_parity;
-    pgs = Math.min(pgs, (n_hosts-1)*n_drives/(ec_total-1));
-    const l = capacity/(degraded_replacement ? 1 : pgs)/speed/86400/365;
-    return 1 - (1 - afr_drive * (1-(1-failure_rate_fullmesh(ec_total-1, afr_drive*l, ec_parity))**pgs)) ** (n_hosts*n_drives);
-}
-
-// Same as above, but also take server failures into account
-function cluster_afr_pgs_hosts({ n_hosts, n_drives, afr_drive, afr_host, capacity, speed, replicas, pgs = 1, degraded_replacement })
-{
-    let otherhosts = Math.min(pgs, (n_hosts-1)/(replicas-1));
-    pgs = Math.min(pgs, (n_hosts-1)*n_drives/(replicas-1));
-    let pgh = Math.min(pgs*n_drives, (n_hosts-1)*n_drives/(replicas-1));
-    const ld = capacity/(degraded_replacement ? 1 : pgs)/speed/86400/365;
-    const lh = n_drives*capacity/pgs/speed/86400/365;
-    const p1 = ((afr_drive+afr_host*pgs/otherhosts)*lh);
-    const p2 = ((afr_drive+afr_host*pgs/otherhosts)*ld);
-    return 1 - ((1 - afr_host * (1-(1-p1**(replicas-1))**pgh)) ** n_hosts) *
-        ((1 - afr_drive * (1-(1-p2**(replicas-1))**pgs)) ** (n_hosts*n_drives));
-}
-
-function cluster_afr_pgs_ec_hosts({ n_hosts, n_drives, afr_drive, afr_host, capacity, speed, ec: [ ec_data, ec_parity ], pgs = 1, degraded_replacement })
-{
-    const ec_total = ec_data+ec_parity;
-    const otherhosts = Math.min(pgs, (n_hosts-1)/(ec_total-1));
-    pgs = Math.min(pgs, (n_hosts-1)*n_drives/(ec_total-1));
-    const pgh = Math.min(pgs*n_drives, (n_hosts-1)*n_drives/(ec_total-1));
-    const ld = capacity/(degraded_replacement ? 1 : pgs)/speed/86400/365;
-    const lh = n_drives*capacity/pgs/speed/86400/365;
-    const p1 = ((afr_drive+afr_host*pgs/otherhosts)*lh);
-    const p2 = ((afr_drive+afr_host*pgs/otherhosts)*ld);
-    return 1 - ((1 - afr_host * (1-(1-failure_rate_fullmesh(ec_total-1, p1, ec_parity))**pgh)) ** n_hosts) *
-        ((1 - afr_drive * (1-(1-failure_rate_fullmesh(ec_total-1, p2, ec_parity))**pgs)) ** (n_hosts*n_drives));
-}
-
-// Wrapper for 4 above functions
-function cluster_afr(config)
-{
-    if (config.ec && config.afr_host)
-    {
-        return cluster_afr_pgs_ec_hosts(config);
-    }
-    else if (config.ec)
-    {
-        return cluster_afr_pgs_ec(config);
-    }
-    else if (config.afr_host)
-    {
-        return cluster_afr_pgs_hosts(config);
-    }
-    else
-    {
-        return cluster_afr_pgs(config);
-    }
-}
-
-function print_cluster_afr(config)
-{
-    console.log(
-        `${config.n_hosts} nodes with ${config.n_drives} ${sprintf("%.1f", config.capacity/1000)}TB drives`+
-        `, capable to backfill at ${sprintf("%.1f", config.speed*1000)} MB/s, drive AFR ${sprintf("%.1f", config.afr_drive*100)}%`+
-        (config.afr_host ? `, host AFR ${sprintf("%.1f", config.afr_host*100)}%` : '')+
-        (config.ec ? `, EC ${config.ec[0]}+${config.ec[1]}` : `, ${config.replicas} replicas`)+
-        `, ${config.pgs||1} PG per OSD`+
-        (config.degraded_replacement ? `\n...and you don't let the rebalance finish before replacing drives` : '')
-    );
-    console.log('-> '+sprintf("%.7f%%", 100*cluster_afr(config))+'\n');
+    const pg_size = (ec ? ec_data+ec_parity : replicas);
+    pgs = Math.min(pgs, (n_hosts-1)*n_drives/(pg_size-1));
+    const host_pgs = Math.min(pgs*n_drives, (n_hosts-1)*n_drives/(pg_size-1));
+    const resilver_disk = n_drives == 1 || osd_rm ? pgs : (n_drives-1);
+    const disk_heal_time = (down_out_interval + capacity/(degraded_replacement ? 1 : resilver_disk)/speed)/86400/365;
+    const host_heal_time = (down_out_interval + n_drives*capacity/pgs/speed)/86400/365;
+    const disk_heal_fail = ((afr_drive+afr_host/n_drives)*disk_heal_time);
+    const host_heal_fail = ((afr_drive+afr_host/n_drives)*host_heal_time);
+    const disk_pg_fail = ec
+        ? failure_rate_fullmesh(ec_data+ec_parity-1, disk_heal_fail, ec_parity)
+        : disk_heal_fail**(replicas-1);
+    const host_pg_fail = ec
+        ? failure_rate_fullmesh(ec_data+ec_parity-1, host_heal_fail, ec_parity)
+        : host_heal_fail**(replicas-1);
+    return 1 - ((1 - afr_drive * (1-(1-disk_pg_fail)**pgs)) ** (n_hosts*n_drives))
+        * ((1 - afr_host * (1-(1-host_pg_fail)**host_pgs)) ** n_hosts);
 }
 
 /******** UTILITY ********/
diff --git a/mon/afr_test.js b/mon/afr_test.js
new file mode 100644
index 00000000..80d7db68
--- /dev/null
+++ b/mon/afr_test.js
@@ -0,0 +1,28 @@
+const { sprintf } = require('sprintf-js');
+const { cluster_afr } = require('./afr.js');
+
+print_cluster_afr({ n_hosts: 4, n_drives: 6, afr_drive: 0.03, afr_host: 0.05, capacity: 4000, speed: 0.1, replicas: 2 });
+print_cluster_afr({ n_hosts: 4, n_drives: 3, afr_drive: 0.03, afr_host: 0, capacity: 4000, speed: 0.1, replicas: 2 });
+print_cluster_afr({ n_hosts: 4, n_drives: 3, afr_drive: 0.03, afr_host: 0.05, capacity: 4000, speed: 0.1, replicas: 2 });
+print_cluster_afr({ n_hosts: 4, n_drives: 3, afr_drive: 0.03, afr_host: 0, capacity: 4000, speed: 0.1, ec: true, ec_data: 2, ec_parity: 1 });
+print_cluster_afr({ n_hosts: 4, n_drives: 3, afr_drive: 0.03, afr_host: 0.05, capacity: 4000, speed: 0.1, ec: true, ec_data: 2, ec_parity: 1 });
+print_cluster_afr({ n_hosts: 10, n_drives: 10, afr_drive: 0.1, afr_host: 0, capacity: 8000, speed: 0.02, replicas: 2 });
+print_cluster_afr({ n_hosts: 10, n_drives: 10, afr_drive: 0.1, afr_host: 0.05, capacity: 8000, speed: 0.02, replicas: 2 });
+print_cluster_afr({ n_hosts: 10, n_drives: 10, afr_drive: 0.1, afr_host: 0, capacity: 8000, speed: 0.02, replicas: 3 });
+print_cluster_afr({ n_hosts: 10, n_drives: 10, afr_drive: 0.1, afr_host: 0.05, capacity: 8000, speed: 0.02, replicas: 3 });
+print_cluster_afr({ n_hosts: 10, n_drives: 10, afr_drive: 0.1, afr_host: 0, capacity: 8000, speed: 0.02, replicas: 3, pgs: 100 });
+print_cluster_afr({ n_hosts: 10, n_drives: 10, afr_drive: 0.1, afr_host: 0.05, capacity: 8000, speed: 0.02, replicas: 3, pgs: 100 });
+print_cluster_afr({ n_hosts: 10, n_drives: 10, afr_drive: 0.1, afr_host: 0.05, capacity: 8000, speed: 0.02, replicas: 3, pgs: 100, degraded_replacement: 1 });
+
+function print_cluster_afr(config)
+{
+    console.log(
+        `${config.n_hosts} nodes with ${config.n_drives} ${sprintf("%.1f", config.capacity/1000)}TB drives`+
+        `, capable to backfill at ${sprintf("%.1f", config.speed*1000)} MB/s, drive AFR ${sprintf("%.1f", config.afr_drive*100)}%`+
+        (config.afr_host ? `, host AFR ${sprintf("%.1f", config.afr_host*100)}%` : '')+
+        (config.ec ? `, EC ${config.ec_data}+${config.ec_parity}` : `, ${config.replicas} replicas`)+
+        `, ${config.pgs||1} PG per OSD`+
+        (config.degraded_replacement ? `\n...and you don't let the rebalance finish before replacing drives` : '')
+    );
+    console.log('-> '+sprintf("%.7f%%", 100*cluster_afr(config))+'\n');
+}
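
For reference, a minimal usage sketch of the patched calculator (not part of the commit), assuming Node.js and a script placed next to mon/afr.js; the parameters mirror one of the afr_test.js cases:

const { cluster_afr } = require('./afr.js');

// 10 hosts x 10 drives of 8 TB each, 20 MB/s backfill per drive,
// 10% drive AFR, 5% host AFR, 3 replicas, 100 peer PGs per OSD.
// cluster_afr() returns the estimated probability of losing data within a year.
const afr = cluster_afr({
    n_hosts: 10, n_drives: 10,
    afr_drive: 0.1, afr_host: 0.05,
    capacity: 8000, speed: 0.02,
    replicas: 3, pgs: 100,
});
console.log((100*afr).toFixed(7) + '%');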