Fix degraded object recovery (it seems to work now)
parent
7acfc95f75
commit
c0a22d825d
|
@ -67,7 +67,7 @@ int blockstore_impl_t::dequeue_stable(blockstore_op_t *op)
|
||||||
else if (IS_UNSYNCED(dirty_it->second.state))
|
else if (IS_UNSYNCED(dirty_it->second.state))
|
||||||
{
|
{
|
||||||
// Object not synced yet. Caller must sync it first
|
// Object not synced yet. Caller must sync it first
|
||||||
op->retval = EAGAIN;
|
op->retval = -EAGAIN;
|
||||||
FINISH_OP(op);
|
FINISH_OP(op);
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
12
osd.h
12
osd.h
|
@ -169,10 +169,10 @@ struct osd_object_id_t
|
||||||
|
|
||||||
struct osd_recovery_state_t
|
struct osd_recovery_state_t
|
||||||
{
|
{
|
||||||
int st;
|
int st = 0;
|
||||||
pg_num_t pg_num;
|
pg_num_t pg_num = 0;
|
||||||
object_id oid;
|
object_id oid = { 0 };
|
||||||
osd_op_t *op;
|
osd_op_t *op = NULL;
|
||||||
};
|
};
|
||||||
|
|
||||||
class osd_t
|
class osd_t
|
||||||
|
@ -197,7 +197,7 @@ class osd_t
|
||||||
int peering_state = 0;
|
int peering_state = 0;
|
||||||
unsigned pg_count = 0;
|
unsigned pg_count = 0;
|
||||||
uint64_t next_subop_id = 1;
|
uint64_t next_subop_id = 1;
|
||||||
osd_recovery_state_t *recovery_state;
|
osd_recovery_state_t recovery_state;
|
||||||
|
|
||||||
// Unstable writes
|
// Unstable writes
|
||||||
std::map<osd_object_id_t, uint64_t> unstable_writes;
|
std::map<osd_object_id_t, uint64_t> unstable_writes;
|
||||||
|
@ -276,7 +276,7 @@ class osd_t
|
||||||
void continue_primary_write(osd_op_t *cur_op);
|
void continue_primary_write(osd_op_t *cur_op);
|
||||||
void continue_primary_sync(osd_op_t *cur_op);
|
void continue_primary_sync(osd_op_t *cur_op);
|
||||||
void finish_op(osd_op_t *cur_op, int retval);
|
void finish_op(osd_op_t *cur_op, int retval);
|
||||||
void handle_primary_subop(osd_op_t *cur_op, int ok, uint64_t version);
|
void handle_primary_subop(uint64_t opcode, osd_op_t *cur_op, int ok, uint64_t version);
|
||||||
void submit_primary_subops(int submit_type, int read_pg_size, const uint64_t* osd_set, osd_op_t *cur_op);
|
void submit_primary_subops(int submit_type, int read_pg_size, const uint64_t* osd_set, osd_op_t *cur_op);
|
||||||
void submit_primary_sync_subops(osd_op_t *cur_op);
|
void submit_primary_sync_subops(osd_op_t *cur_op);
|
||||||
void submit_primary_stab_subops(osd_op_t *cur_op);
|
void submit_primary_stab_subops(osd_op_t *cur_op);
|
||||||
|
|
|
@ -190,67 +190,68 @@ void osd_t::submit_flush_op(pg_num_t pg_num, pg_flush_batch_t *fb, bool rollback
|
||||||
bool osd_t::continue_recovery()
|
bool osd_t::continue_recovery()
|
||||||
{
|
{
|
||||||
pg_t *pg = NULL;
|
pg_t *pg = NULL;
|
||||||
if (recovery_state->st == 0) goto resume_0;
|
if (recovery_state.st == 0) goto resume_0;
|
||||||
else if (recovery_state->st == 1) goto resume_1;
|
else if (recovery_state.st == 1) goto resume_1;
|
||||||
else if (recovery_state->st == 2) goto resume_2;
|
else if (recovery_state.st == 2) goto resume_2;
|
||||||
else if (recovery_state->st == 3) goto resume_3;
|
else if (recovery_state.st == 3) goto resume_3;
|
||||||
else if (recovery_state->st == 4) goto resume_4;
|
else if (recovery_state.st == 4) goto resume_4;
|
||||||
resume_0:
|
resume_0:
|
||||||
for (auto p: pgs)
|
for (auto p: pgs)
|
||||||
{
|
{
|
||||||
if (p.second.state & PG_HAS_DEGRADED)
|
if (p.second.state & PG_HAS_DEGRADED)
|
||||||
{
|
{
|
||||||
recovery_state->pg_num = p.first;
|
recovery_state.pg_num = p.first;
|
||||||
goto resume_1;
|
goto resume_1;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
recovery_state->st = 0;
|
recovery_state.st = 0;
|
||||||
return false;
|
return false;
|
||||||
resume_1:
|
resume_1:
|
||||||
pg = &pgs[recovery_state->pg_num];
|
pg = &pgs[recovery_state.pg_num];
|
||||||
if (!pg->degraded_objects.size())
|
if (!pg->degraded_objects.size())
|
||||||
{
|
{
|
||||||
pg->state = pg->state & ~PG_HAS_DEGRADED;
|
pg->state = pg->state & ~PG_HAS_DEGRADED;
|
||||||
|
pg->print_state();
|
||||||
goto resume_0;
|
goto resume_0;
|
||||||
}
|
}
|
||||||
recovery_state->oid = pg->degraded_objects.begin()->first;
|
recovery_state.oid = pg->degraded_objects.begin()->first;
|
||||||
recovery_state->op = new osd_op_t();
|
recovery_state.op = new osd_op_t();
|
||||||
recovery_state->op->op_type = OSD_OP_OUT;
|
recovery_state.op->op_type = OSD_OP_OUT;
|
||||||
recovery_state->op->req = {
|
recovery_state.op->req = {
|
||||||
.rw = {
|
.rw = {
|
||||||
.header = {
|
.header = {
|
||||||
.magic = SECONDARY_OSD_OP_MAGIC,
|
.magic = SECONDARY_OSD_OP_MAGIC,
|
||||||
.id = 0,
|
.id = 1,
|
||||||
.opcode = OSD_OP_WRITE,
|
.opcode = OSD_OP_WRITE,
|
||||||
},
|
},
|
||||||
.inode = recovery_state->oid.inode,
|
.inode = recovery_state.oid.inode,
|
||||||
.offset = recovery_state->oid.stripe,
|
.offset = recovery_state.oid.stripe,
|
||||||
.len = 0,
|
.len = 0,
|
||||||
},
|
},
|
||||||
};
|
};
|
||||||
recovery_state->op->callback = [this](osd_op_t *op)
|
recovery_state.op->callback = [this](osd_op_t *op)
|
||||||
{
|
{
|
||||||
if (op->reply.hdr.retval < 0)
|
if (op->reply.hdr.retval < 0)
|
||||||
recovery_state->st += 1; // error
|
recovery_state.st += 1; // error
|
||||||
else
|
else
|
||||||
recovery_state->st += 2; // ok
|
recovery_state.st += 2; // ok
|
||||||
continue_recovery();
|
continue_recovery();
|
||||||
};
|
};
|
||||||
exec_op(recovery_state->op);
|
exec_op(recovery_state.op);
|
||||||
recovery_state->st = 2;
|
recovery_state.st = 2;
|
||||||
resume_2:
|
resume_2:
|
||||||
return true;
|
return true;
|
||||||
resume_3:
|
resume_3:
|
||||||
// FIXME handle error
|
// FIXME handle error
|
||||||
throw std::runtime_error("failed to recover an object");
|
throw std::runtime_error("failed to recover an object");
|
||||||
resume_4:
|
resume_4:
|
||||||
delete recovery_state->op;
|
delete recovery_state.op;
|
||||||
recovery_state->op = NULL;
|
recovery_state.op = NULL;
|
||||||
// Don't sync the write, it will be synced by our regular sync coroutine
|
// Don't sync the write, it will be synced by our regular sync coroutine
|
||||||
pg = &pgs[recovery_state->pg_num];
|
pg = &pgs[recovery_state.pg_num];
|
||||||
pg_osd_set_state_t *st;
|
pg_osd_set_state_t *st;
|
||||||
{
|
{
|
||||||
auto st_it = pg->degraded_objects.find(recovery_state->oid);
|
auto st_it = pg->degraded_objects.find(recovery_state.oid);
|
||||||
st = st_it->second;
|
st = st_it->second;
|
||||||
pg->degraded_objects.erase(st_it);
|
pg->degraded_objects.erase(st_it);
|
||||||
}
|
}
|
||||||
|
@ -311,12 +312,12 @@ resume_4:
|
||||||
}
|
}
|
||||||
new_st = &st_it->second;
|
new_st = &st_it->second;
|
||||||
new_st->object_count++;
|
new_st->object_count++;
|
||||||
pg->misplaced_objects[recovery_state->oid] = new_st;
|
pg->misplaced_objects[recovery_state.oid] = new_st;
|
||||||
}
|
}
|
||||||
if (!st->object_count)
|
if (!st->object_count)
|
||||||
{
|
{
|
||||||
pg->state_dict.erase(st->osd_set);
|
pg->state_dict.erase(st->osd_set);
|
||||||
}
|
}
|
||||||
recovery_state->st = 0;
|
recovery_state.st = 0;
|
||||||
goto resume_0;
|
goto resume_0;
|
||||||
}
|
}
|
||||||
|
|
|
@ -177,9 +177,9 @@ void osd_t::handle_peers()
|
||||||
{
|
{
|
||||||
p.second.calc_object_states();
|
p.second.calc_object_states();
|
||||||
if (p.second.state & PG_HAS_UNCLEAN)
|
if (p.second.state & PG_HAS_UNCLEAN)
|
||||||
{
|
|
||||||
peering_state = peering_state | OSD_FLUSHING_PGS;
|
peering_state = peering_state | OSD_FLUSHING_PGS;
|
||||||
}
|
else
|
||||||
|
peering_state = peering_state | OSD_RECOVERING;
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
|
|
|
@ -233,7 +233,9 @@ void osd_t::submit_primary_subops(int submit_type, int pg_size, const uint64_t*
|
||||||
{
|
{
|
||||||
zero_read = -1;
|
zero_read = -1;
|
||||||
}
|
}
|
||||||
|
uint64_t op_version = w ? op_data->fact_ver+1 : (submit_type == SUBMIT_RMW_READ ? UINT64_MAX : op_data->target_ver);
|
||||||
osd_op_t *subops = new osd_op_t[n_subops];
|
osd_op_t *subops = new osd_op_t[n_subops];
|
||||||
|
op_data->fact_ver = 0;
|
||||||
op_data->done = op_data->errors = 0;
|
op_data->done = op_data->errors = 0;
|
||||||
op_data->n_subops = n_subops;
|
op_data->n_subops = n_subops;
|
||||||
op_data->subops = subops;
|
op_data->subops = subops;
|
||||||
|
@ -254,13 +256,14 @@ void osd_t::submit_primary_subops(int submit_type, int pg_size, const uint64_t*
|
||||||
.opcode = (uint64_t)(w ? BS_OP_WRITE : BS_OP_READ),
|
.opcode = (uint64_t)(w ? BS_OP_WRITE : BS_OP_READ),
|
||||||
.callback = [cur_op, this](blockstore_op_t *subop)
|
.callback = [cur_op, this](blockstore_op_t *subop)
|
||||||
{
|
{
|
||||||
handle_primary_subop(cur_op, subop->retval == subop->len, subop->version);
|
handle_primary_subop(subop->opcode == BS_OP_WRITE ? OSD_OP_SECONDARY_WRITE : OSD_OP_SECONDARY_READ,
|
||||||
|
cur_op, subop->retval == subop->len, subop->version);
|
||||||
},
|
},
|
||||||
.oid = {
|
.oid = {
|
||||||
.inode = op_data->oid.inode,
|
.inode = op_data->oid.inode,
|
||||||
.stripe = op_data->oid.stripe | role,
|
.stripe = op_data->oid.stripe | role,
|
||||||
},
|
},
|
||||||
.version = w ? 0 : (submit_type == SUBMIT_RMW_READ ? UINT64_MAX : op_data->target_ver),
|
.version = op_version,
|
||||||
.offset = w ? stripes[role].write_start : stripes[role].read_start,
|
.offset = w ? stripes[role].write_start : stripes[role].read_start,
|
||||||
.len = w ? stripes[role].write_end - stripes[role].write_start : stripes[role].read_end - stripes[role].read_start,
|
.len = w ? stripes[role].write_end - stripes[role].write_start : stripes[role].read_end - stripes[role].read_start,
|
||||||
.buf = w ? stripes[role].write_buf : stripes[role].read_buf,
|
.buf = w ? stripes[role].write_buf : stripes[role].read_buf,
|
||||||
|
@ -282,7 +285,7 @@ void osd_t::submit_primary_subops(int submit_type, int pg_size, const uint64_t*
|
||||||
.inode = op_data->oid.inode,
|
.inode = op_data->oid.inode,
|
||||||
.stripe = op_data->oid.stripe | role,
|
.stripe = op_data->oid.stripe | role,
|
||||||
},
|
},
|
||||||
.version = w ? 0 : (submit_type == SUBMIT_RMW_READ ? UINT64_MAX : op_data->target_ver),
|
.version = op_version,
|
||||||
.offset = w ? stripes[role].write_start : stripes[role].read_start,
|
.offset = w ? stripes[role].write_start : stripes[role].read_start,
|
||||||
.len = w ? stripes[role].write_end - stripes[role].write_start : stripes[role].read_end - stripes[role].read_start,
|
.len = w ? stripes[role].write_end - stripes[role].write_start : stripes[role].read_end - stripes[role].read_start,
|
||||||
};
|
};
|
||||||
|
@ -295,7 +298,7 @@ void osd_t::submit_primary_subops(int submit_type, int pg_size, const uint64_t*
|
||||||
{
|
{
|
||||||
// so it doesn't get freed
|
// so it doesn't get freed
|
||||||
subop->buf = NULL;
|
subop->buf = NULL;
|
||||||
handle_primary_subop(cur_op, subop->reply.hdr.retval == subop->req.sec_rw.len, subop->reply.sec_rw.version);
|
handle_primary_subop(subop->req.hdr.opcode, cur_op, subop->reply.hdr.retval == subop->req.sec_rw.len, subop->reply.sec_rw.version);
|
||||||
};
|
};
|
||||||
outbox_push(clients[subops[subop].peer_fd], &subops[subop]);
|
outbox_push(clients[subops[subop].peer_fd], &subops[subop]);
|
||||||
}
|
}
|
||||||
|
@ -304,14 +307,22 @@ void osd_t::submit_primary_subops(int submit_type, int pg_size, const uint64_t*
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void osd_t::handle_primary_subop(osd_op_t *cur_op, int ok, uint64_t version)
|
void osd_t::handle_primary_subop(uint64_t opcode, osd_op_t *cur_op, int ok, uint64_t version)
|
||||||
{
|
{
|
||||||
osd_primary_op_data_t *op_data = cur_op->op_data;
|
osd_primary_op_data_t *op_data = cur_op->op_data;
|
||||||
|
if (opcode == OSD_OP_SECONDARY_READ || opcode == OSD_OP_SECONDARY_WRITE)
|
||||||
|
{
|
||||||
|
if (op_data->fact_ver != 0 && op_data->fact_ver != version)
|
||||||
|
{
|
||||||
|
throw std::runtime_error("different fact_versions returned from subops: "+std::to_string(version)+" vs "+std::to_string(op_data->fact_ver));
|
||||||
|
}
|
||||||
op_data->fact_ver = version;
|
op_data->fact_ver = version;
|
||||||
|
}
|
||||||
if (!ok)
|
if (!ok)
|
||||||
{
|
{
|
||||||
// FIXME: Handle errors
|
// FIXME: Handle errors
|
||||||
op_data->errors++;
|
op_data->errors++;
|
||||||
|
throw std::runtime_error("subop error for op "+std::to_string(cur_op->req.hdr.opcode)+": "+std::to_string(op_data->st));
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
|
@ -413,6 +424,11 @@ void osd_t::continue_primary_write(osd_op_t *cur_op)
|
||||||
resume_8:
|
resume_8:
|
||||||
return;
|
return;
|
||||||
resume_9:
|
resume_9:
|
||||||
|
for (int i = 0; i < pg.pg_minsize; i++)
|
||||||
|
{
|
||||||
|
op_data->stripes[i].read_start = 0;
|
||||||
|
op_data->stripes[i].read_end = 0;
|
||||||
|
}
|
||||||
memcpy(
|
memcpy(
|
||||||
op_data->recovery_buf + cur_op->req.rw.offset - op_data->oid.stripe,
|
op_data->recovery_buf + cur_op->req.rw.offset - op_data->oid.stripe,
|
||||||
cur_op->buf, cur_op->req.rw.len
|
cur_op->buf, cur_op->req.rw.len
|
||||||
|
@ -420,8 +436,11 @@ resume_9:
|
||||||
free(cur_op->buf);
|
free(cur_op->buf);
|
||||||
cur_op->buf = op_data->recovery_buf;
|
cur_op->buf = op_data->recovery_buf;
|
||||||
op_data->recovery_buf = NULL;
|
op_data->recovery_buf = NULL;
|
||||||
|
// Determine blocks to write, bypass RMW_READ
|
||||||
|
cur_op->rmw_buf = calc_rmw_reads(cur_op->buf, op_data->stripes, pg.cur_set.data(), pg.pg_size, pg.pg_minsize, pg.pg_cursize);
|
||||||
|
goto resume_3;
|
||||||
resume_1:
|
resume_1:
|
||||||
// Determine blocks to read
|
// Determine blocks to read and write
|
||||||
cur_op->rmw_buf = calc_rmw_reads(cur_op->buf, op_data->stripes, pg.cur_set.data(), pg.pg_size, pg.pg_minsize, pg.pg_cursize);
|
cur_op->rmw_buf = calc_rmw_reads(cur_op->buf, op_data->stripes, pg.cur_set.data(), pg.pg_size, pg.pg_minsize, pg.pg_cursize);
|
||||||
// Read required blocks
|
// Read required blocks
|
||||||
submit_primary_subops(SUBMIT_RMW_READ, pg.pg_size, pg.cur_set.data(), cur_op);
|
submit_primary_subops(SUBMIT_RMW_READ, pg.pg_size, pg.cur_set.data(), cur_op);
|
||||||
|
@ -641,7 +660,7 @@ void osd_t::submit_primary_sync_subops(osd_op_t *cur_op)
|
||||||
.opcode = BS_OP_SYNC,
|
.opcode = BS_OP_SYNC,
|
||||||
.callback = [cur_op, this](blockstore_op_t *subop)
|
.callback = [cur_op, this](blockstore_op_t *subop)
|
||||||
{
|
{
|
||||||
handle_primary_subop(cur_op, subop->retval == 0, 0);
|
handle_primary_subop(OSD_OP_SECONDARY_SYNC, cur_op, subop->retval == 0, 0);
|
||||||
},
|
},
|
||||||
});
|
});
|
||||||
bs->enqueue_op(subops[i].bs_op);
|
bs->enqueue_op(subops[i].bs_op);
|
||||||
|
@ -660,7 +679,7 @@ void osd_t::submit_primary_sync_subops(osd_op_t *cur_op)
|
||||||
};
|
};
|
||||||
subops[i].callback = [cur_op, this](osd_op_t *subop)
|
subops[i].callback = [cur_op, this](osd_op_t *subop)
|
||||||
{
|
{
|
||||||
handle_primary_subop(cur_op, subop->reply.hdr.retval == 0, 0);
|
handle_primary_subop(OSD_OP_SECONDARY_SYNC, cur_op, subop->reply.hdr.retval == 0, 0);
|
||||||
};
|
};
|
||||||
outbox_push(clients[subops[i].peer_fd], &subops[i]);
|
outbox_push(clients[subops[i].peer_fd], &subops[i]);
|
||||||
}
|
}
|
||||||
|
@ -684,7 +703,7 @@ void osd_t::submit_primary_stab_subops(osd_op_t *cur_op)
|
||||||
.opcode = BS_OP_STABLE,
|
.opcode = BS_OP_STABLE,
|
||||||
.callback = [cur_op, this](blockstore_op_t *subop)
|
.callback = [cur_op, this](blockstore_op_t *subop)
|
||||||
{
|
{
|
||||||
handle_primary_subop(cur_op, subop->retval == 0, 0);
|
handle_primary_subop(OSD_OP_SECONDARY_STABILIZE, cur_op, subop->retval == 0, 0);
|
||||||
},
|
},
|
||||||
.len = (uint32_t)stab_osd.len,
|
.len = (uint32_t)stab_osd.len,
|
||||||
.buf = (void*)(op_data->unstable_writes + stab_osd.start),
|
.buf = (void*)(op_data->unstable_writes + stab_osd.start),
|
||||||
|
@ -707,7 +726,7 @@ void osd_t::submit_primary_stab_subops(osd_op_t *cur_op)
|
||||||
subops[i].send_list.push_back(op_data->unstable_writes + stab_osd.start, stab_osd.len * sizeof(obj_ver_id));
|
subops[i].send_list.push_back(op_data->unstable_writes + stab_osd.start, stab_osd.len * sizeof(obj_ver_id));
|
||||||
subops[i].callback = [cur_op, this](osd_op_t *subop)
|
subops[i].callback = [cur_op, this](osd_op_t *subop)
|
||||||
{
|
{
|
||||||
handle_primary_subop(cur_op, subop->reply.hdr.retval == 0, 0);
|
handle_primary_subop(OSD_OP_SECONDARY_STABILIZE, cur_op, subop->reply.hdr.retval == 0, 0);
|
||||||
};
|
};
|
||||||
outbox_push(clients[subops[i].peer_fd], &subops[i]);
|
outbox_push(clients[subops[i].peer_fd], &subops[i]);
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue