Begin object state calculation
parent
a8bc44064d
commit
d2a3f0c6dd
100
osd.h
100
osd.h
|
@ -10,7 +10,7 @@
|
||||||
#include <arpa/inet.h>
|
#include <arpa/inet.h>
|
||||||
#include <malloc.h>
|
#include <malloc.h>
|
||||||
|
|
||||||
#include <unordered_map>
|
#include <set>
|
||||||
#include <deque>
|
#include <deque>
|
||||||
|
|
||||||
#include "blockstore.h"
|
#include "blockstore.h"
|
||||||
|
@ -96,27 +96,49 @@ struct osd_client_t
|
||||||
int write_state = 0;
|
int write_state = 0;
|
||||||
};
|
};
|
||||||
|
|
||||||
struct osd_pg_role_t
|
struct osd_obj_loc_t
|
||||||
{
|
{
|
||||||
// role = (stripe role: 1, 2, 3, ...) | (stable ? 0 : 1<<63)
|
|
||||||
uint64_t role;
|
uint64_t role;
|
||||||
uint64_t osd_num;
|
uint64_t osd_num;
|
||||||
|
bool stable;
|
||||||
};
|
};
|
||||||
|
|
||||||
typedef std::vector<osd_pg_role_t> osd_acting_set_t;
|
inline bool operator < (const osd_obj_loc_t &a, const osd_obj_loc_t &b)
|
||||||
|
{
|
||||||
|
return a.role < b.role || a.role == b.role && a.osd_num < b.osd_num;
|
||||||
|
}
|
||||||
|
|
||||||
|
struct osd_obj_state_t
|
||||||
|
{
|
||||||
|
std::vector<osd_obj_loc_t> loc;
|
||||||
|
uint64_t state = 0;
|
||||||
|
uint64_t object_count = 0;
|
||||||
|
};
|
||||||
|
|
||||||
|
// Version override for an object whose newest listed version is not the one
// selected as the target during peering.
struct osd_ver_override_t
{
    // Highest version seen for the object in any listing
    uint64_t max_ver;
    // Version actually chosen as the target
    uint64_t target_ver;
};
|
||||||
|
|
||||||
|
inline bool operator < (const osd_obj_state_t &a, const osd_obj_state_t &b)
|
||||||
|
{
|
||||||
|
return a.loc < b.loc;
|
||||||
|
}
|
||||||
|
|
||||||
namespace std
|
namespace std
|
||||||
{
|
{
|
||||||
template<> struct hash<osd_acting_set_t>
|
template<> struct hash<osd_obj_state_t>
|
||||||
{
|
{
|
||||||
inline size_t operator()(const osd_acting_set_t &s) const
|
inline size_t operator()(const osd_obj_state_t &s) const
|
||||||
{
|
{
|
||||||
size_t seed = 0;
|
size_t seed = 0;
|
||||||
for (int i = 0; i < s.size(); i++)
|
for (int i = 0; i < s.loc.size(); i++)
|
||||||
{
|
{
|
||||||
// Copy-pasted from spp::hash_combine()
|
// Copy-pasted from spp::hash_combine()
|
||||||
seed ^= (s[i].role + 0xc6a4a7935bd1e995 + (seed << 6) + (seed >> 2));
|
seed ^= (s.loc[i].role + 0xc6a4a7935bd1e995 + (seed << 6) + (seed >> 2));
|
||||||
seed ^= (s[i].osd_num + 0xc6a4a7935bd1e995 + (seed << 6) + (seed >> 2));
|
seed ^= (s.loc[i].osd_num + 0xc6a4a7935bd1e995 + (seed << 6) + (seed >> 2));
|
||||||
|
seed ^= ((s.loc[i].stable ? 1 : 0) + 0xc6a4a7935bd1e995 + (seed << 6) + (seed >> 2));
|
||||||
}
|
}
|
||||||
return seed;
|
return seed;
|
||||||
}
|
}
|
||||||
|
@ -135,18 +157,21 @@ namespace std
|
||||||
#define PG_HAS_MISPLACED (1<<6)
|
#define PG_HAS_MISPLACED (1<<6)
|
||||||
|
|
||||||
// OSD object states
|
// OSD object states
|
||||||
#define OSD_CLEAN 0x01
|
#define OBJ_CLEAN 0x01
|
||||||
#define OSD_MISPLACED 0x02
|
#define OBJ_MISPLACED 0x02
|
||||||
#define OSD_DEGRADED 0x03
|
#define OBJ_DEGRADED 0x03
|
||||||
#define OSD_INCOMPLETE 0x04
|
#define OBJ_INCOMPLETE 0x04
|
||||||
#define OSD_HALF_STABLE 0x10000
|
#define OBJ_NONSTABILIZED 0x10000
|
||||||
#define OSD_NEEDS_ROLLBACK 0x20000
|
#define OBJ_UNDERWRITTEN 0x20000
|
||||||
|
#define OBJ_OVERCOPIED 0x40000
|
||||||
|
#define OBJ_BUGGY 0x80000
|
||||||
|
|
||||||
class osd_t;
|
class osd_t;
|
||||||
|
|
||||||
struct osd_pg_peering_state_t
|
struct osd_pg_peering_state_t
|
||||||
{
|
{
|
||||||
osd_t* self;
|
osd_t* self;
|
||||||
|
// FIXME: add types for pg_num and osd_num?
|
||||||
uint64_t pg_num;
|
uint64_t pg_num;
|
||||||
std::unordered_map<uint64_t, osd_op_t*> list_ops;
|
std::unordered_map<uint64_t, osd_op_t*> list_ops;
|
||||||
int list_done = 0;
|
int list_done = 0;
|
||||||
|
@ -155,19 +180,52 @@ struct osd_pg_peering_state_t
|
||||||
struct osd_pg_t
|
struct osd_pg_t
|
||||||
{
|
{
|
||||||
int state;
|
int state;
|
||||||
|
uint64_t pg_size = 3, pg_minsize = 2;
|
||||||
uint64_t pg_num;
|
uint64_t pg_num;
|
||||||
uint64_t n_unfound = 0, n_degraded = 0, n_misplaced = 0;
|
// target_set = (role => osd_num). role starts from zero
|
||||||
std::vector<osd_pg_role_t> target_set;
|
std::vector<uint64_t> target_set;
|
||||||
// moved object map. by default, each object is considered to reside on the target_set.
|
// moved object map. by default, each object is considered to reside on the target_set.
|
||||||
// this map stores all objects that differ.
|
// this map stores all objects that differ.
|
||||||
// this map may consume up to ~ (raw storage / object size) * 24 bytes in the worst case scenario
|
// this map may consume up to ~ (raw storage / object size) * 24 bytes in the worst case scenario
|
||||||
// which is up to ~192 MB per 1 TB in the worst case scenario
|
// which is up to ~192 MB per 1 TB in the worst case scenario
|
||||||
std::unordered_map<osd_acting_set_t, int> acting_set_ids;
|
std::set<osd_obj_state_t> state_dict;
|
||||||
std::map<int, osd_acting_set_t> acting_sets;
|
spp::sparse_hash_map<object_id, const osd_obj_state_t*> obj_states;
|
||||||
spp::sparse_hash_map<object_id, int> object_map;
|
spp::sparse_hash_map<object_id, osd_ver_override_t> ver_override;
|
||||||
osd_pg_peering_state_t *peering_state = NULL;
|
osd_pg_peering_state_t *peering_state = NULL;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
struct obj_ver_role
|
||||||
|
{
|
||||||
|
object_id oid;
|
||||||
|
uint64_t version;
|
||||||
|
uint64_t osd_num;
|
||||||
|
bool is_stable;
|
||||||
|
};
|
||||||
|
|
||||||
|
inline bool operator < (const obj_ver_role & a, const obj_ver_role & b)
|
||||||
|
{
|
||||||
|
return a.oid < b.oid ||
|
||||||
|
// object versions go in descending order
|
||||||
|
a.oid == b.oid && a.version > b.version ||
|
||||||
|
a.oid == b.oid && a.version == b.version ||
|
||||||
|
a.oid == b.oid && a.version == b.version && a.osd_num < b.osd_num;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Max 64 replicas
|
||||||
|
#define STRIPE_MASK 0x3F
|
||||||
|
#define STRIPE_SHIFT 6
|
||||||
|
|
||||||
|
struct osd_obj_state_check_t
|
||||||
|
{
|
||||||
|
int start = 0;
|
||||||
|
object_id oid = { 0 };
|
||||||
|
uint64_t max_ver = 0;
|
||||||
|
uint64_t target_ver = 0;
|
||||||
|
uint64_t n_copies = 0, has_roles = 0, n_roles = 0, n_stable = 0, n_matched = 0;
|
||||||
|
bool is_buggy = false;
|
||||||
|
osd_obj_state_t state_obj;
|
||||||
|
};
|
||||||
|
|
||||||
struct osd_peer_def_t
|
struct osd_peer_def_t
|
||||||
{
|
{
|
||||||
uint64_t osd_num = 0;
|
uint64_t osd_num = 0;
|
||||||
|
@ -234,6 +292,8 @@ class osd_t
|
||||||
void init_primary();
|
void init_primary();
|
||||||
void handle_peers();
|
void handle_peers();
|
||||||
void start_pg_peering(int i);
|
void start_pg_peering(int i);
|
||||||
|
void calc_object_states(osd_pg_t &pg);
|
||||||
|
void remember_object(osd_pg_t &pg, osd_obj_state_check_t &st, std::vector<obj_ver_role> &all, int end);
|
||||||
|
|
||||||
// op execution
|
// op execution
|
||||||
void exec_op(osd_op_t *cur_op);
|
void exec_op(osd_op_t *cur_op);
|
||||||
|
|
164
osd_peering.cpp
164
osd_peering.cpp
|
@ -1,5 +1,8 @@
|
||||||
#include <netinet/tcp.h>
|
#include <netinet/tcp.h>
|
||||||
#include <sys/epoll.h>
|
#include <sys/epoll.h>
|
||||||
|
|
||||||
|
#include <algorithm>
|
||||||
|
|
||||||
#include "osd.h"
|
#include "osd.h"
|
||||||
|
|
||||||
void osd_t::init_primary()
|
void osd_t::init_primary()
|
||||||
|
@ -14,8 +17,9 @@ void osd_t::init_primary()
|
||||||
pgs.push_back((osd_pg_t){
|
pgs.push_back((osd_pg_t){
|
||||||
.state = PG_OFFLINE,
|
.state = PG_OFFLINE,
|
||||||
.pg_num = 1,
|
.pg_num = 1,
|
||||||
.target_set = { { .role = 1, .osd_num = 1 }, { .role = 2, .osd_num = 2 }, { .role = 3, .osd_num = 3 } },
|
.target_set = { 1, 2, 3 },
|
||||||
.object_map = spp::sparse_hash_map<object_id, int>(),
|
.obj_states = spp::sparse_hash_map<object_id, const osd_obj_state_t*>(),
|
||||||
|
.ver_override = spp::sparse_hash_map<object_id, osd_ver_override_t>(),
|
||||||
});
|
});
|
||||||
pg_count = 1;
|
pg_count = 1;
|
||||||
peering_state = 1;
|
peering_state = 1;
|
||||||
|
@ -144,9 +148,9 @@ void osd_t::handle_peers()
|
||||||
{
|
{
|
||||||
// Start PG peering
|
// Start PG peering
|
||||||
pgs[0].state = PG_PEERING;
|
pgs[0].state = PG_PEERING;
|
||||||
pgs[0].acting_set_ids.clear();
|
pgs[0].state_dict.clear();
|
||||||
pgs[0].acting_sets.clear();
|
pgs[0].obj_states.clear();
|
||||||
pgs[0].object_map.clear();
|
pgs[0].ver_override.clear();
|
||||||
if (pgs[0].peering_state)
|
if (pgs[0].peering_state)
|
||||||
delete pgs[0].peering_state;
|
delete pgs[0].peering_state;
|
||||||
peering_state = 2;
|
peering_state = 2;
|
||||||
|
@ -168,7 +172,7 @@ void osd_t::handle_peers()
|
||||||
}
|
}
|
||||||
else if (pgs[i].peering_state->list_done >= 3)
|
else if (pgs[i].peering_state->list_done >= 3)
|
||||||
{
|
{
|
||||||
// FIXME
|
calc_object_states(pgs[i]);
|
||||||
peering_state = 0;
|
peering_state = 0;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -193,6 +197,9 @@ void osd_t::start_pg_peering(int pg_idx)
|
||||||
"Got object list from OSD %lu (local): %d objects (%lu of them stable)\n",
|
"Got object list from OSD %lu (local): %d objects (%lu of them stable)\n",
|
||||||
ps->self->osd_num, bs_op->retval, bs_op->version
|
ps->self->osd_num, bs_op->retval, bs_op->version
|
||||||
);
|
);
|
||||||
|
op->buf = op->bs_op.buf;
|
||||||
|
op->reply.hdr.retval = op->bs_op.retval;
|
||||||
|
op->reply.sec_list.stable_count = op->bs_op.version;
|
||||||
ps->list_done++;
|
ps->list_done++;
|
||||||
};
|
};
|
||||||
pg.peering_state->list_ops[osd_num] = op;
|
pg.peering_state->list_ops[osd_num] = op;
|
||||||
|
@ -228,3 +235,148 @@ void osd_t::start_pg_peering(int pg_idx)
|
||||||
outbox_push(cl, op);
|
outbox_push(cl, op);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Stores the outcome of the state check for a single object.
//
// pg  - placement group whose state dictionary / object maps are updated
// st  - counters collected for the object; st.state_obj is used as a scratch buffer
// all - merged listing; entries [st.start, end) are recorded as the object's locations
// end - index one past the last listing entry to record
//
// Objects that turn out to be exactly OBJ_CLEAN are not recorded at all.
void osd_t::remember_object(osd_pg_t &pg, osd_obj_state_check_t &st, std::vector<obj_ver_role> &all, int end)
{
    // Base state: derived from how many distinct roles are present
    uint64_t flags =
        st.n_roles == pg.pg_size
            ? (st.n_matched == pg.pg_size ? OBJ_CLEAN : OBJ_MISPLACED)
            : (st.n_roles < pg.pg_minsize ? OBJ_INCOMPLETE : OBJ_DEGRADED);
    // Additional flags
    if (st.n_copies > pg.pg_size)
    {
        flags |= OBJ_OVERCOPIED;
    }
    if (st.n_stable < st.n_copies)
    {
        flags |= OBJ_NONSTABILIZED;
    }
    if (st.target_ver < st.max_ver)
    {
        flags |= OBJ_UNDERWRITTEN;
    }
    if (st.is_buggy)
    {
        flags |= OBJ_BUGGY;
    }
    if (flags == OBJ_CLEAN)
    {
        // Clean objects are implied, nothing to remember
        return;
    }
    // Build the location list for this object
    st.state_obj.state = flags;
    auto & locations = st.state_obj.loc;
    locations.clear();
    for (int pos = st.start; pos < end; pos++)
    {
        const obj_ver_role & entry = all[pos];
        osd_obj_loc_t item;
        item.role = (entry.oid.stripe & STRIPE_MASK);
        item.osd_num = entry.osd_num;
        item.stable = entry.is_stable;
        locations.push_back(item);
    }
    std::sort(locations.begin(), locations.end());
    // Deduplicate the state through the dictionary and point the object at the shared entry
    const osd_obj_state_t *shared = &(*(pg.state_dict.insert(st.state_obj).first));
    pg.obj_states[st.oid] = shared;
    if (flags & OBJ_UNDERWRITTEN)
    {
        // The newest version is not the target one - remember both
        osd_ver_override_t override_ver;
        override_ver.max_ver = st.max_ver;
        override_ver.target_ver = st.target_ver;
        pg.ver_override[st.oid] = override_ver;
    }
}
|
||||||
|
|
||||||
|
// Computes the state of every object of a PG from the object listings
// collected during peering.
//
// Steps:
//   1. Merge the listings held by pg.peering_state->list_ops into one array,
//      tagging every entry with the reporting OSD and its stability, and
//      release the listing buffers.
//   2. Sort the array (object id, then version descending, then OSD).
//   3. Walk the array object by object, pick the target version and count
//      copies / roles / stable copies / correctly placed copies.
//   4. Hand every finished object to remember_object().
void osd_t::calc_object_states(osd_pg_t &pg)
{
    // Copy all object lists into one array
    std::vector<obj_ver_role> all;
    auto ps = pg.peering_state;
    for (auto e: ps->list_ops)
    {
        osd_op_t* op = e.second;
        auto nstab = op->reply.sec_list.stable_count;
        // NOTE(review): a negative retval (listing error) is not handled here - confirm callers rule it out
        auto n = op->reply.hdr.retval;
        // NOTE(review): list_ops appears to be keyed by OSD number; confirm peer_fd is valid for the local listing
        auto osd_num = clients[op->peer_fd].osd_num;
        // Append after the entries already merged from previous listings
        const size_t base = all.size();
        all.resize(base + n);
        obj_ver_id *ov = (obj_ver_id*)op->buf;
        for (uint64_t i = 0; i < n; i++, ov++)
        {
            all[base + i] = {
                .oid = ov->oid,
                .version = ov->version,
                .osd_num = osd_num,
                // The first <stable_count> entries of a listing are the stable ones
                .is_stable = i < nstab,
            };
        }
        free(op->buf);
        op->buf = NULL;
    }
    // Sort
    std::sort(all.begin(), all.end());
    // Walk over it and check object states
    osd_obj_state_check_t st;
    size_t i = 0;
    while (i < all.size())
    {
        if (st.oid.inode != all[i].oid.inode ||
            st.oid.stripe != (all[i].oid.stripe >> STRIPE_SHIFT))
        {
            // A new object begins here
            if (st.oid.inode != 0)
            {
                // Remember object state
                remember_object(pg, st, all, (int)i);
            }
            st.start = (int)i;
            st.oid = { .inode = all[i].oid.inode, .stripe = all[i].oid.stripe >> STRIPE_SHIFT };
            st.max_ver = st.target_ver = all[i].version;
            st.has_roles = st.n_copies = st.n_roles = st.n_stable = st.n_matched = 0;
            st.is_buggy = false;
        }
        if (st.target_ver != all[i].version)
        {
            if (st.n_stable > 0 || st.n_roles >= pg.pg_minsize)
            {
                // Version is either recoverable or stable, choose it as target and skip previous versions
                remember_object(pg, st, all, (int)i);
                while (i < all.size() && st.oid.inode == all[i].oid.inode &&
                    st.oid.stripe == (all[i].oid.stripe >> STRIPE_SHIFT))
                {
                    i++;
                }
                // The object is already remembered: forget it so that it is not
                // remembered a second time when the next object (or the end of
                // the list) is reached. <i> already points at the first entry of
                // the next object, so it must not be advanced here.
                st.oid = { 0 };
                continue;
            }
            else
            {
                // Remember that there are newer unrecoverable versions
                st.target_ver = all[i].version;
                st.has_roles = st.n_copies = st.n_roles = st.n_stable = st.n_matched = 0;
            }
        }
        const uint64_t replica = (all[i].oid.stripe & STRIPE_MASK);
        st.n_copies++;
        if (replica >= pg.pg_size)
        {
            // FIXME In the future, check it against the PG epoch number to handle replication factor/scheme changes
            st.is_buggy = true;
        }
        else
        {
            if (all[i].is_stable)
            {
                st.n_stable++;
            }
            // Placement is checked independently of stability
            if (pg.target_set[replica] == all[i].osd_num)
            {
                st.n_matched++;
            }
            // 64-bit shift: has_roles is a 64-bit mask and roles may go up to STRIPE_MASK
            const uint64_t role_bit = ((uint64_t)1 << replica);
            if (!(st.has_roles & role_bit))
            {
                st.has_roles = st.has_roles | role_bit;
                st.n_roles++;
            }
        }
        i++;
    }
    if (st.oid.inode != 0)
    {
        // Remember object state
        remember_object(pg, st, all, (int)all.size());
    }
}
|
||||||
|
|
8
test.cpp
8
test.cpp
|
@ -344,7 +344,11 @@ int main(int argc, char *argv[])
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
printf("Sorting\n");
|
printf("Sorting\n");
|
||||||
// sort takes 7 s
|
// sorting the whole array takes 7 s
|
||||||
std::sort(to_sort.begin(), to_sort.end());
|
//std::sort(to_sort.begin(), to_sort.end());
|
||||||
|
// sorting in 3 parts... almost the same, 6 s
|
||||||
|
std::sort(to_sort.begin(), to_sort.begin() + to_sort.size()/3);
|
||||||
|
std::sort(to_sort.begin() + to_sort.size()/3, to_sort.begin() + to_sort.size()*2/3);
|
||||||
|
std::sort(to_sort.begin() + to_sort.size()*2/3, to_sort.end());
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue