There is also a down_out_interval

Dead OSDs are marked "out", so only OSDs from the same hosts work to resilver
2021-01-15 19:29:07 +03:00 · 2021-01-15 19:29:07 +03:00
2 changed files with 28 additions and 22 deletions
--- a/afr.js
+++ b/afr.js
@@ -42,57 +42,63 @@ function failure_rate_fullmesh(n, a, f)
 /******** PGS: EACH OSD ONLY COMMUNICATES WITH <pgs> OTHER OSDs ********/

 // <n> hosts of <m> drives of <capacity> GB, each able to backfill at <speed> GB/s,
-// <k> replicas, <pgs> unique peer PGs per OSD
+// <k> replicas, <pgs> unique peer PGs per OSD (~50 for 100 PG-per-OSD in a big cluster)
 //
 // For each of n*m drives: P(drive fails in a year) * P(any of its peers fail in <l*365> next days).
 // More peers per OSD increase rebalance speed (more drives work together to resilver) if you
-// let them finish rebalance BEFORE replacing the failed drive.
+// let them finish rebalance BEFORE replacing the failed drive (degraded_replacement=false).
 // At the same time, more peers per OSD increase probability of any of them to fail!
+// osd_rm=true means that a failed OSD's data is rebalanced over all other hosts,
+// not only within the same host as Ceph does by default (dead OSDs are marked 'out').
 //
 // Probability of all except one drives in a replica group to fail is (AFR^(k-1)).
 // So with <x> PGs it becomes ~ (x * (AFR*L/365)^(k-1)). Interesting but reasonable consequence
 // is that, with k=2, total failure rate doesn't depend on number of peers per OSD,
 // because it gets increased linearly by increased number of peers to fail
 // and decreased linearly by reduced rebalance time.
-function cluster_afr_pgs({ n_hosts, n_drives, afr_drive, capacity, speed, replicas, pgs = 1, degraded_replacement })
+function cluster_afr_pgs({ n_hosts, n_drives, afr_drive, capacity, speed, replicas, pgs = 1, osd_rm, degraded_replacement, down_out_interval = 600 })
 {
    pgs = Math.min(pgs, (n_hosts-1)*n_drives/(replicas-1));
-    const l = capacity/(degraded_replacement ? 1 : pgs)/speed/86400/365;
-    return 1 - (1 - afr_drive * (1-(1-(afr_drive*l)**(replicas-1))**pgs)) ** (n_hosts*n_drives);
+    const resilver_disk = n_drives == 1 || osd_rm ? pgs : (n_drives-1);
+    const disk_heal_time = (down_out_interval + capacity/(degraded_replacement ? 1 : resilver_disk)/speed)/86400/365;
+    return 1 - (1 - afr_drive * (1-(1-(afr_drive*disk_heal_time)**(replicas-1))**pgs)) ** (n_hosts*n_drives);
 }

-function cluster_afr_pgs_ec({ n_hosts, n_drives, afr_drive, capacity, speed, ec: [ ec_data, ec_parity ], pgs = 1, degraded_replacement })
+function cluster_afr_pgs_ec({ n_hosts, n_drives, afr_drive, capacity, speed, ec: [ ec_data, ec_parity ], pgs = 1, osd_rm, degraded_replacement, down_out_interval = 600 })
 {
    const ec_total = ec_data+ec_parity;
    pgs = Math.min(pgs, (n_hosts-1)*n_drives/(ec_total-1));
-    const l = capacity/(degraded_replacement ? 1 : pgs)/speed/86400/365;
-    return 1 - (1 - afr_drive * (1-(1-failure_rate_fullmesh(ec_total-1, afr_drive*l, ec_parity))**pgs)) ** (n_hosts*n_drives);
+    const resilver_disk = n_drives == 1 || osd_rm ? pgs : (n_drives-1);
+    const disk_heal_time = (down_out_interval + capacity/(degraded_replacement ? 1 : resilver_disk)/speed)/86400/365;
+    return 1 - (1 - afr_drive * (1-(1-failure_rate_fullmesh(ec_total-1, afr_drive*disk_heal_time, ec_parity))**pgs)) ** (n_hosts*n_drives);
 }

 // Same as above, but also take server failures into account
-function cluster_afr_pgs_hosts({ n_hosts, n_drives, afr_drive, afr_host, capacity, speed, replicas, pgs = 1, degraded_replacement })
+function cluster_afr_pgs_hosts({ n_hosts, n_drives, afr_drive, afr_host, capacity, speed, replicas, pgs = 1, osd_rm, degraded_replacement, down_out_interval = 600 })
 {
-    let otherhosts = Math.min(pgs, (n_hosts-1)/(replicas-1));
+    const otherhosts = Math.min(pgs, (n_hosts-1)/(replicas-1));
    pgs = Math.min(pgs, (n_hosts-1)*n_drives/(replicas-1));
-    let pgh = Math.min(pgs*n_drives, (n_hosts-1)*n_drives/(replicas-1));
-    const ld = capacity/(degraded_replacement ? 1 : pgs)/speed/86400/365;
-    const lh = n_drives*capacity/pgs/speed/86400/365;
-    const p1 = ((afr_drive+afr_host*pgs/otherhosts)*lh);
-    const p2 = ((afr_drive+afr_host*pgs/otherhosts)*ld);
+    const resilver_disk = n_drives == 1 || osd_rm ? pgs : (n_drives-1);
+    const pgh = Math.min(pgs*n_drives, (n_hosts-1)*n_drives/(replicas-1));
+    const disk_heal_time = (down_out_interval + capacity/(degraded_replacement ? 1 : resilver_disk)/speed)/86400/365;
+    const host_heal_time = (down_out_interval + n_drives*capacity/pgs/speed)/86400/365;
+    const p1 = ((afr_drive+afr_host*pgs/otherhosts)*host_heal_time);
+    const p2 = ((afr_drive+afr_host*pgs/otherhosts)*disk_heal_time);
    return 1 - ((1 - afr_host * (1-(1-p1**(replicas-1))**pgh)) ** n_hosts) *
        ((1 - afr_drive * (1-(1-p2**(replicas-1))**pgs)) ** (n_hosts*n_drives));
 }

-function cluster_afr_pgs_ec_hosts({ n_hosts, n_drives, afr_drive, afr_host, capacity, speed, ec: [ ec_data, ec_parity ], pgs = 1, degraded_replacement })
+function cluster_afr_pgs_ec_hosts({ n_hosts, n_drives, afr_drive, afr_host, capacity, speed, ec: [ ec_data, ec_parity ], pgs = 1, osd_rm, degraded_replacement, down_out_interval = 600 })
 {
    const ec_total = ec_data+ec_parity;
    const otherhosts = Math.min(pgs, (n_hosts-1)/(ec_total-1));
    pgs = Math.min(pgs, (n_hosts-1)*n_drives/(ec_total-1));
+    const resilver_disk = n_drives == 1 || osd_rm ? pgs : (n_drives-1);
    const pgh = Math.min(pgs*n_drives, (n_hosts-1)*n_drives/(ec_total-1));
-    const ld = capacity/(degraded_replacement ? 1 : pgs)/speed/86400/365;
-    const lh = n_drives*capacity/pgs/speed/86400/365;
-    const p1 = ((afr_drive+afr_host*pgs/otherhosts)*lh);
-    const p2 = ((afr_drive+afr_host*pgs/otherhosts)*ld);
+    const disk_heal_time = (down_out_interval + capacity/(degraded_replacement ? 1 : resilver_disk)/speed)/86400/365;
+    const host_heal_time = (down_out_interval + n_drives*capacity/pgs/speed)/86400/365;
+    const p1 = ((afr_drive+afr_host*pgs/otherhosts)*host_heal_time);
+    const p2 = ((afr_drive+afr_host*pgs/otherhosts)*disk_heal_time);
    return 1 - ((1 - afr_host * (1-(1-failure_rate_fullmesh(ec_total-1, p1, ec_parity))**pgh)) ** n_hosts) *
        ((1 - afr_drive * (1-(1-failure_rate_fullmesh(ec_total-1, p2, ec_parity))**pgs)) ** (n_hosts*n_drives));
 }
--- a/main.js
+++ b/main.js
@@ -31,7 +31,7 @@ class Calc extends preact.Component
            speed: st.speed/1000,
            ec: st.ec ? [ st.ec_data, st.ec_parity ] : null,
            replicas: st.replicas,
-            pgs: 100,
+            pgs: 50,
            degraded_replacement: st.eager,
        });
        this.setState(st);
@@ -67,7 +67,7 @@ class Calc extends preact.Component
    format4 = (n) =>
    {
        let p = Math.abs(n-(n|0)), m = 10000;
-        while (p != 0 && p < 0.1)
+        while (n < 1 && p != 0 && p < 0.1)
        {
            p = p*10;
            m = m*10;
Author	SHA1	Message	Date
Vitaliy Filippov	25e5b28204	There is also a down_out_interval	2021-01-15 19:29:07 +03:00
Vitaliy Filippov	62fe6bf681	Dead OSDs are marked "out", so only OSDs from the same hosts work to resilver	2021-01-15 19:29:07 +03:00