Fix degraded object recovery (it seems to work now)

trace-sqes
Vitaliy Filippov 2020-03-25 02:17:12 +03:00
parent 7acfc95f75
commit c0a22d825d
5 changed files with 66 additions and 46 deletions

View File

@ -67,7 +67,7 @@ int blockstore_impl_t::dequeue_stable(blockstore_op_t *op)
else if (IS_UNSYNCED(dirty_it->second.state)) else if (IS_UNSYNCED(dirty_it->second.state))
{ {
// Object not synced yet. Caller must sync it first // Object not synced yet. Caller must sync it first
op->retval = EAGAIN; op->retval = -EAGAIN;
FINISH_OP(op); FINISH_OP(op);
return 1; return 1;
} }

12
osd.h
View File

@ -169,10 +169,10 @@ struct osd_object_id_t
struct osd_recovery_state_t struct osd_recovery_state_t
{ {
int st; int st = 0;
pg_num_t pg_num; pg_num_t pg_num = 0;
object_id oid; object_id oid = { 0 };
osd_op_t *op; osd_op_t *op = NULL;
}; };
class osd_t class osd_t
@ -197,7 +197,7 @@ class osd_t
int peering_state = 0; int peering_state = 0;
unsigned pg_count = 0; unsigned pg_count = 0;
uint64_t next_subop_id = 1; uint64_t next_subop_id = 1;
osd_recovery_state_t *recovery_state; osd_recovery_state_t recovery_state;
// Unstable writes // Unstable writes
std::map<osd_object_id_t, uint64_t> unstable_writes; std::map<osd_object_id_t, uint64_t> unstable_writes;
@ -276,7 +276,7 @@ class osd_t
void continue_primary_write(osd_op_t *cur_op); void continue_primary_write(osd_op_t *cur_op);
void continue_primary_sync(osd_op_t *cur_op); void continue_primary_sync(osd_op_t *cur_op);
void finish_op(osd_op_t *cur_op, int retval); void finish_op(osd_op_t *cur_op, int retval);
void handle_primary_subop(osd_op_t *cur_op, int ok, uint64_t version); void handle_primary_subop(uint64_t opcode, osd_op_t *cur_op, int ok, uint64_t version);
void submit_primary_subops(int submit_type, int read_pg_size, const uint64_t* osd_set, osd_op_t *cur_op); void submit_primary_subops(int submit_type, int read_pg_size, const uint64_t* osd_set, osd_op_t *cur_op);
void submit_primary_sync_subops(osd_op_t *cur_op); void submit_primary_sync_subops(osd_op_t *cur_op);
void submit_primary_stab_subops(osd_op_t *cur_op); void submit_primary_stab_subops(osd_op_t *cur_op);

View File

@ -190,67 +190,68 @@ void osd_t::submit_flush_op(pg_num_t pg_num, pg_flush_batch_t *fb, bool rollback
bool osd_t::continue_recovery() bool osd_t::continue_recovery()
{ {
pg_t *pg = NULL; pg_t *pg = NULL;
if (recovery_state->st == 0) goto resume_0; if (recovery_state.st == 0) goto resume_0;
else if (recovery_state->st == 1) goto resume_1; else if (recovery_state.st == 1) goto resume_1;
else if (recovery_state->st == 2) goto resume_2; else if (recovery_state.st == 2) goto resume_2;
else if (recovery_state->st == 3) goto resume_3; else if (recovery_state.st == 3) goto resume_3;
else if (recovery_state->st == 4) goto resume_4; else if (recovery_state.st == 4) goto resume_4;
resume_0: resume_0:
for (auto p: pgs) for (auto p: pgs)
{ {
if (p.second.state & PG_HAS_DEGRADED) if (p.second.state & PG_HAS_DEGRADED)
{ {
recovery_state->pg_num = p.first; recovery_state.pg_num = p.first;
goto resume_1; goto resume_1;
} }
} }
recovery_state->st = 0; recovery_state.st = 0;
return false; return false;
resume_1: resume_1:
pg = &pgs[recovery_state->pg_num]; pg = &pgs[recovery_state.pg_num];
if (!pg->degraded_objects.size()) if (!pg->degraded_objects.size())
{ {
pg->state = pg->state & ~PG_HAS_DEGRADED; pg->state = pg->state & ~PG_HAS_DEGRADED;
pg->print_state();
goto resume_0; goto resume_0;
} }
recovery_state->oid = pg->degraded_objects.begin()->first; recovery_state.oid = pg->degraded_objects.begin()->first;
recovery_state->op = new osd_op_t(); recovery_state.op = new osd_op_t();
recovery_state->op->op_type = OSD_OP_OUT; recovery_state.op->op_type = OSD_OP_OUT;
recovery_state->op->req = { recovery_state.op->req = {
.rw = { .rw = {
.header = { .header = {
.magic = SECONDARY_OSD_OP_MAGIC, .magic = SECONDARY_OSD_OP_MAGIC,
.id = 0, .id = 1,
.opcode = OSD_OP_WRITE, .opcode = OSD_OP_WRITE,
}, },
.inode = recovery_state->oid.inode, .inode = recovery_state.oid.inode,
.offset = recovery_state->oid.stripe, .offset = recovery_state.oid.stripe,
.len = 0, .len = 0,
}, },
}; };
recovery_state->op->callback = [this](osd_op_t *op) recovery_state.op->callback = [this](osd_op_t *op)
{ {
if (op->reply.hdr.retval < 0) if (op->reply.hdr.retval < 0)
recovery_state->st += 1; // error recovery_state.st += 1; // error
else else
recovery_state->st += 2; // ok recovery_state.st += 2; // ok
continue_recovery(); continue_recovery();
}; };
exec_op(recovery_state->op); exec_op(recovery_state.op);
recovery_state->st = 2; recovery_state.st = 2;
resume_2: resume_2:
return true; return true;
resume_3: resume_3:
// FIXME handle error // FIXME handle error
throw std::runtime_error("failed to recover an object"); throw std::runtime_error("failed to recover an object");
resume_4: resume_4:
delete recovery_state->op; delete recovery_state.op;
recovery_state->op = NULL; recovery_state.op = NULL;
// Don't sync the write, it will be synced by our regular sync coroutine // Don't sync the write, it will be synced by our regular sync coroutine
pg = &pgs[recovery_state->pg_num]; pg = &pgs[recovery_state.pg_num];
pg_osd_set_state_t *st; pg_osd_set_state_t *st;
{ {
auto st_it = pg->degraded_objects.find(recovery_state->oid); auto st_it = pg->degraded_objects.find(recovery_state.oid);
st = st_it->second; st = st_it->second;
pg->degraded_objects.erase(st_it); pg->degraded_objects.erase(st_it);
} }
@ -311,12 +312,12 @@ resume_4:
} }
new_st = &st_it->second; new_st = &st_it->second;
new_st->object_count++; new_st->object_count++;
pg->misplaced_objects[recovery_state->oid] = new_st; pg->misplaced_objects[recovery_state.oid] = new_st;
} }
if (!st->object_count) if (!st->object_count)
{ {
pg->state_dict.erase(st->osd_set); pg->state_dict.erase(st->osd_set);
} }
recovery_state->st = 0; recovery_state.st = 0;
goto resume_0; goto resume_0;
} }

View File

@ -177,9 +177,9 @@ void osd_t::handle_peers()
{ {
p.second.calc_object_states(); p.second.calc_object_states();
if (p.second.state & PG_HAS_UNCLEAN) if (p.second.state & PG_HAS_UNCLEAN)
{
peering_state = peering_state | OSD_FLUSHING_PGS; peering_state = peering_state | OSD_FLUSHING_PGS;
} else
peering_state = peering_state | OSD_RECOVERING;
} }
else else
{ {

View File

@ -233,7 +233,9 @@ void osd_t::submit_primary_subops(int submit_type, int pg_size, const uint64_t*
{ {
zero_read = -1; zero_read = -1;
} }
uint64_t op_version = w ? op_data->fact_ver+1 : (submit_type == SUBMIT_RMW_READ ? UINT64_MAX : op_data->target_ver);
osd_op_t *subops = new osd_op_t[n_subops]; osd_op_t *subops = new osd_op_t[n_subops];
op_data->fact_ver = 0;
op_data->done = op_data->errors = 0; op_data->done = op_data->errors = 0;
op_data->n_subops = n_subops; op_data->n_subops = n_subops;
op_data->subops = subops; op_data->subops = subops;
@ -254,13 +256,14 @@ void osd_t::submit_primary_subops(int submit_type, int pg_size, const uint64_t*
.opcode = (uint64_t)(w ? BS_OP_WRITE : BS_OP_READ), .opcode = (uint64_t)(w ? BS_OP_WRITE : BS_OP_READ),
.callback = [cur_op, this](blockstore_op_t *subop) .callback = [cur_op, this](blockstore_op_t *subop)
{ {
handle_primary_subop(cur_op, subop->retval == subop->len, subop->version); handle_primary_subop(subop->opcode == BS_OP_WRITE ? OSD_OP_SECONDARY_WRITE : OSD_OP_SECONDARY_READ,
cur_op, subop->retval == subop->len, subop->version);
}, },
.oid = { .oid = {
.inode = op_data->oid.inode, .inode = op_data->oid.inode,
.stripe = op_data->oid.stripe | role, .stripe = op_data->oid.stripe | role,
}, },
.version = w ? 0 : (submit_type == SUBMIT_RMW_READ ? UINT64_MAX : op_data->target_ver), .version = op_version,
.offset = w ? stripes[role].write_start : stripes[role].read_start, .offset = w ? stripes[role].write_start : stripes[role].read_start,
.len = w ? stripes[role].write_end - stripes[role].write_start : stripes[role].read_end - stripes[role].read_start, .len = w ? stripes[role].write_end - stripes[role].write_start : stripes[role].read_end - stripes[role].read_start,
.buf = w ? stripes[role].write_buf : stripes[role].read_buf, .buf = w ? stripes[role].write_buf : stripes[role].read_buf,
@ -282,7 +285,7 @@ void osd_t::submit_primary_subops(int submit_type, int pg_size, const uint64_t*
.inode = op_data->oid.inode, .inode = op_data->oid.inode,
.stripe = op_data->oid.stripe | role, .stripe = op_data->oid.stripe | role,
}, },
.version = w ? 0 : (submit_type == SUBMIT_RMW_READ ? UINT64_MAX : op_data->target_ver), .version = op_version,
.offset = w ? stripes[role].write_start : stripes[role].read_start, .offset = w ? stripes[role].write_start : stripes[role].read_start,
.len = w ? stripes[role].write_end - stripes[role].write_start : stripes[role].read_end - stripes[role].read_start, .len = w ? stripes[role].write_end - stripes[role].write_start : stripes[role].read_end - stripes[role].read_start,
}; };
@ -295,7 +298,7 @@ void osd_t::submit_primary_subops(int submit_type, int pg_size, const uint64_t*
{ {
// so it doesn't get freed // so it doesn't get freed
subop->buf = NULL; subop->buf = NULL;
handle_primary_subop(cur_op, subop->reply.hdr.retval == subop->req.sec_rw.len, subop->reply.sec_rw.version); handle_primary_subop(subop->req.hdr.opcode, cur_op, subop->reply.hdr.retval == subop->req.sec_rw.len, subop->reply.sec_rw.version);
}; };
outbox_push(clients[subops[subop].peer_fd], &subops[subop]); outbox_push(clients[subops[subop].peer_fd], &subops[subop]);
} }
@ -304,14 +307,22 @@ void osd_t::submit_primary_subops(int submit_type, int pg_size, const uint64_t*
} }
} }
void osd_t::handle_primary_subop(osd_op_t *cur_op, int ok, uint64_t version) void osd_t::handle_primary_subop(uint64_t opcode, osd_op_t *cur_op, int ok, uint64_t version)
{ {
osd_primary_op_data_t *op_data = cur_op->op_data; osd_primary_op_data_t *op_data = cur_op->op_data;
if (opcode == OSD_OP_SECONDARY_READ || opcode == OSD_OP_SECONDARY_WRITE)
{
if (op_data->fact_ver != 0 && op_data->fact_ver != version)
{
throw std::runtime_error("different fact_versions returned from subops: "+std::to_string(version)+" vs "+std::to_string(op_data->fact_ver));
}
op_data->fact_ver = version; op_data->fact_ver = version;
}
if (!ok) if (!ok)
{ {
// FIXME: Handle errors // FIXME: Handle errors
op_data->errors++; op_data->errors++;
throw std::runtime_error("subop error for op "+std::to_string(cur_op->req.hdr.opcode)+": "+std::to_string(op_data->st));
} }
else else
{ {
@ -413,6 +424,11 @@ void osd_t::continue_primary_write(osd_op_t *cur_op)
resume_8: resume_8:
return; return;
resume_9: resume_9:
for (int i = 0; i < pg.pg_minsize; i++)
{
op_data->stripes[i].read_start = 0;
op_data->stripes[i].read_end = 0;
}
memcpy( memcpy(
op_data->recovery_buf + cur_op->req.rw.offset - op_data->oid.stripe, op_data->recovery_buf + cur_op->req.rw.offset - op_data->oid.stripe,
cur_op->buf, cur_op->req.rw.len cur_op->buf, cur_op->req.rw.len
@ -420,8 +436,11 @@ resume_9:
free(cur_op->buf); free(cur_op->buf);
cur_op->buf = op_data->recovery_buf; cur_op->buf = op_data->recovery_buf;
op_data->recovery_buf = NULL; op_data->recovery_buf = NULL;
// Determine blocks to write, bypass RMW_READ
cur_op->rmw_buf = calc_rmw_reads(cur_op->buf, op_data->stripes, pg.cur_set.data(), pg.pg_size, pg.pg_minsize, pg.pg_cursize);
goto resume_3;
resume_1: resume_1:
// Determine blocks to read // Determine blocks to read and write
cur_op->rmw_buf = calc_rmw_reads(cur_op->buf, op_data->stripes, pg.cur_set.data(), pg.pg_size, pg.pg_minsize, pg.pg_cursize); cur_op->rmw_buf = calc_rmw_reads(cur_op->buf, op_data->stripes, pg.cur_set.data(), pg.pg_size, pg.pg_minsize, pg.pg_cursize);
// Read required blocks // Read required blocks
submit_primary_subops(SUBMIT_RMW_READ, pg.pg_size, pg.cur_set.data(), cur_op); submit_primary_subops(SUBMIT_RMW_READ, pg.pg_size, pg.cur_set.data(), cur_op);
@ -641,7 +660,7 @@ void osd_t::submit_primary_sync_subops(osd_op_t *cur_op)
.opcode = BS_OP_SYNC, .opcode = BS_OP_SYNC,
.callback = [cur_op, this](blockstore_op_t *subop) .callback = [cur_op, this](blockstore_op_t *subop)
{ {
handle_primary_subop(cur_op, subop->retval == 0, 0); handle_primary_subop(OSD_OP_SECONDARY_SYNC, cur_op, subop->retval == 0, 0);
}, },
}); });
bs->enqueue_op(subops[i].bs_op); bs->enqueue_op(subops[i].bs_op);
@ -660,7 +679,7 @@ void osd_t::submit_primary_sync_subops(osd_op_t *cur_op)
}; };
subops[i].callback = [cur_op, this](osd_op_t *subop) subops[i].callback = [cur_op, this](osd_op_t *subop)
{ {
handle_primary_subop(cur_op, subop->reply.hdr.retval == 0, 0); handle_primary_subop(OSD_OP_SECONDARY_SYNC, cur_op, subop->reply.hdr.retval == 0, 0);
}; };
outbox_push(clients[subops[i].peer_fd], &subops[i]); outbox_push(clients[subops[i].peer_fd], &subops[i]);
} }
@ -684,7 +703,7 @@ void osd_t::submit_primary_stab_subops(osd_op_t *cur_op)
.opcode = BS_OP_STABLE, .opcode = BS_OP_STABLE,
.callback = [cur_op, this](blockstore_op_t *subop) .callback = [cur_op, this](blockstore_op_t *subop)
{ {
handle_primary_subop(cur_op, subop->retval == 0, 0); handle_primary_subop(OSD_OP_SECONDARY_STABILIZE, cur_op, subop->retval == 0, 0);
}, },
.len = (uint32_t)stab_osd.len, .len = (uint32_t)stab_osd.len,
.buf = (void*)(op_data->unstable_writes + stab_osd.start), .buf = (void*)(op_data->unstable_writes + stab_osd.start),
@ -707,7 +726,7 @@ void osd_t::submit_primary_stab_subops(osd_op_t *cur_op)
subops[i].send_list.push_back(op_data->unstable_writes + stab_osd.start, stab_osd.len * sizeof(obj_ver_id)); subops[i].send_list.push_back(op_data->unstable_writes + stab_osd.start, stab_osd.len * sizeof(obj_ver_id));
subops[i].callback = [cur_op, this](osd_op_t *subop) subops[i].callback = [cur_op, this](osd_op_t *subop)
{ {
handle_primary_subop(cur_op, subop->reply.hdr.retval == 0, 0); handle_primary_subop(OSD_OP_SECONDARY_STABILIZE, cur_op, subop->reply.hdr.retval == 0, 0);
}; };
outbox_push(clients[subops[i].peer_fd], &subops[i]); outbox_push(clients[subops[i].peer_fd], &subops[i]);
} }