Fix crashes, print some stats

Notably:
- fix the `delete op` inside lambda callback crash (it frees the lambda itself
  which results in use-after-free with g++)
- fix stop_client() reenterability
- fix a bug in the blockstore layer which resulted in always returning version=0
  for zero-length reads
- change error codes for blockstore_stabilize
trace-sqes
Vitaliy Filippov 2020-03-31 17:50:50 +03:00
parent 92c800bb64
commit 0f43f6d3f6
14 changed files with 118 additions and 78 deletions

View File

@ -93,7 +93,7 @@ Input:
- buf = pre-allocated obj_ver_id array <len> units long
Output:
- retval = 0 or negative error number (-EINVAL or -EBUSY if not synced)
- retval = 0 or negative error number (-EINVAL, -ENOENT if no such version or -EBUSY if not synced)
## BS_OP_SYNC_STAB_ALL

View File

@ -322,7 +322,7 @@ void blockstore_impl_t::enqueue_op(blockstore_op_t *op, bool first)
{
// Basic verification not passed
op->retval = -EINVAL;
op->callback(op);
std::function<void (blockstore_op_t*)>(op->callback)(op);
return;
}
if (op->opcode == BS_OP_SYNC_STAB_ALL)
@ -365,13 +365,13 @@ void blockstore_impl_t::enqueue_op(blockstore_op_t *op, bool first)
}
if (op->opcode == BS_OP_WRITE && !enqueue_write(op))
{
op->callback(op);
std::function<void (blockstore_op_t*)>(op->callback)(op);
return;
}
if (op->opcode == BS_OP_SYNC && immediate_commit == IMMEDIATE_ALL)
{
op->retval = 0;
op->callback(op);
std::function<void (blockstore_op_t*)>(op->callback)(op);
return;
}
// Call constructor without allocating memory. We'll call destructor before returning op back

View File

@ -141,7 +141,7 @@ struct fulfill_read_t
};
#define PRIV(op) ((blockstore_op_private_t*)(op)->private_data)
#define FINISH_OP(op) PRIV(op)->~blockstore_op_private_t(); op->callback(op)
#define FINISH_OP(op) PRIV(op)->~blockstore_op_private_t(); std::function<void (blockstore_op_t*)>(op->callback)(op)
struct blockstore_op_private_t
{

View File

@ -131,63 +131,66 @@ int blockstore_impl_t::dequeue_read(blockstore_op_t *read_op)
dirty_it--;
}
}
if (clean_it != clean_db.end() && fulfilled < read_op->len)
if (clean_it != clean_db.end())
{
if (!result_version)
{
result_version = clean_it->second.version;
}
if (!clean_entry_bitmap_size)
if (fulfilled < read_op->len)
{
if (!fulfill_read(read_op, fulfilled, 0, block_size, ST_CURRENT, 0, clean_it->second.location))
if (!clean_entry_bitmap_size)
{
// need to wait. undo added requests, don't dequeue op
PRIV(read_op)->read_vec.clear();
return 0;
}
}
else
{
uint64_t meta_loc = clean_it->second.location >> block_order;
uint8_t *clean_entry_bitmap;
if (inmemory_meta)
{
uint64_t sector = (meta_loc / (meta_block_size / clean_entry_size)) * meta_block_size;
uint64_t pos = (meta_loc % (meta_block_size / clean_entry_size));
clean_entry_bitmap = (uint8_t*)(metadata_buffer + sector + pos*clean_entry_size + sizeof(clean_disk_entry));
if (!fulfill_read(read_op, fulfilled, 0, block_size, ST_CURRENT, 0, clean_it->second.location))
{
// need to wait. undo added requests, don't dequeue op
PRIV(read_op)->read_vec.clear();
return 0;
}
}
else
{
clean_entry_bitmap = (uint8_t*)(clean_bitmap + meta_loc*clean_entry_bitmap_size);
}
uint64_t bmp_start = 0, bmp_end = 0, bmp_size = block_size/bitmap_granularity;
while (bmp_start < bmp_size)
{
while (!(clean_entry_bitmap[bmp_end >> 3] & (1 << (bmp_end & 0x7))) && bmp_end < bmp_size)
uint64_t meta_loc = clean_it->second.location >> block_order;
uint8_t *clean_entry_bitmap;
if (inmemory_meta)
{
bmp_end++;
uint64_t sector = (meta_loc / (meta_block_size / clean_entry_size)) * meta_block_size;
uint64_t pos = (meta_loc % (meta_block_size / clean_entry_size));
clean_entry_bitmap = (uint8_t*)(metadata_buffer + sector + pos*clean_entry_size + sizeof(clean_disk_entry));
}
if (bmp_end > bmp_start)
else
{
// fill with zeroes
fulfill_read(read_op, fulfilled, bmp_start * bitmap_granularity,
bmp_end * bitmap_granularity, ST_DEL_STABLE, 0, 0);
clean_entry_bitmap = (uint8_t*)(clean_bitmap + meta_loc*clean_entry_bitmap_size);
}
bmp_start = bmp_end;
while (clean_entry_bitmap[bmp_end >> 3] & (1 << (bmp_end & 0x7)) && bmp_end < bmp_size)
uint64_t bmp_start = 0, bmp_end = 0, bmp_size = block_size/bitmap_granularity;
while (bmp_start < bmp_size)
{
bmp_end++;
}
if (bmp_end > bmp_start)
{
if (!fulfill_read(read_op, fulfilled, bmp_start * bitmap_granularity,
bmp_end * bitmap_granularity, ST_CURRENT, 0, clean_it->second.location + bmp_start * bitmap_granularity))
while (!(clean_entry_bitmap[bmp_end >> 3] & (1 << (bmp_end & 0x7))) && bmp_end < bmp_size)
{
// need to wait. undo added requests, don't dequeue op
PRIV(read_op)->read_vec.clear();
return 0;
bmp_end++;
}
if (bmp_end > bmp_start)
{
// fill with zeroes
fulfill_read(read_op, fulfilled, bmp_start * bitmap_granularity,
bmp_end * bitmap_granularity, ST_DEL_STABLE, 0, 0);
}
bmp_start = bmp_end;
while (clean_entry_bitmap[bmp_end >> 3] & (1 << (bmp_end & 0x7)) && bmp_end < bmp_size)
{
bmp_end++;
}
if (bmp_end > bmp_start)
{
if (!fulfill_read(read_op, fulfilled, bmp_start * bitmap_granularity,
bmp_end * bitmap_granularity, ST_CURRENT, 0, clean_it->second.location + bmp_start * bitmap_granularity))
{
// need to wait. undo added requests, don't dequeue op
PRIV(read_op)->read_vec.clear();
return 0;
}
bmp_start = bmp_end;
}
}
}
}

View File

@ -24,7 +24,7 @@ int blockstore_impl_t::dequeue_rollback(blockstore_op_t *op)
// FIXME Skip this object version
}
bad_op:
op->retval = -EINVAL;
op->retval = -ENOENT;
FINISH_OP(op);
return 1;
}

View File

@ -55,7 +55,7 @@ int blockstore_impl_t::dequeue_stable(blockstore_op_t *op)
if (clean_it == clean_db.end() || clean_it->second.version < v->version)
{
// No such object version
op->retval = -EINVAL;
op->retval = -ENOENT;
FINISH_OP(op);
return 1;
}

View File

@ -42,7 +42,7 @@ bool blockstore_impl_t::enqueue_write(blockstore_op_t *op)
else if (op->version < version)
{
// Invalid version requested
op->retval = -EINVAL;
op->retval = -EEXIST;
return false;
}
if (deleted && is_del)

42
osd.cpp
View File

@ -7,7 +7,7 @@
#include "osd.h"
static const char* osd_op_names[] = {
const char* osd_op_names[] = {
"",
"read",
"write",
@ -54,6 +54,18 @@ osd_t::osd_t(blockstore_config_t & config, blockstore_t *bs, ring_loop_t *ringlo
send_stat_count = 0;
send_stat_sum = 0;
}
if (incomplete_objects > 0)
{
printf("%lu object(s) incomplete\n", incomplete_objects);
}
if (degraded_objects > 0)
{
printf("%lu object(s) degraded\n", degraded_objects);
}
if (misplaced_objects > 0)
{
printf("%lu object(s) misplaced\n", misplaced_objects);
}
});
this->bs_block_size = bs->get_block_size();
// FIXME: use bitmap granularity instead
@ -301,7 +313,8 @@ void osd_t::cancel_op(osd_op_t *op)
op->reply.hdr.id = op->req.hdr.id;
op->reply.hdr.opcode = op->req.hdr.opcode;
op->reply.hdr.retval = -EPIPE;
op->callback(op);
// Copy lambda to be unaffected by `delete op`
std::function<void(osd_op_t*)>(op->callback)(op);
}
else
{
@ -316,7 +329,16 @@ void osd_t::stop_client(int peer_fd)
{
return;
}
auto & cl = it->second;
osd_client_t cl = it->second;
if (cl.osd_num)
{
printf("[%lu] Stopping client %d (OSD peer %lu)\n", osd_num, peer_fd, cl.osd_num);
}
else
{
printf("[%lu] Stopping client %d (regular client)\n", osd_num, peer_fd);
}
clients.erase(it);
if (epoll_ctl(epoll_fd, EPOLL_CTL_DEL, peer_fd, NULL) < 0)
{
throw std::runtime_error(std::string("epoll_ctl: ") + strerror(errno));
@ -350,7 +372,6 @@ void osd_t::stop_client(int peer_fd)
}
}
free(cl.in_buf);
clients.erase(it);
close(peer_fd);
}
@ -372,18 +393,7 @@ void osd_t::exec_op(osd_op_t *cur_op)
(cur_op->req.rw.len > OSD_RW_MAX || cur_op->req.rw.len % OSD_RW_ALIGN || cur_op->req.rw.offset % OSD_RW_ALIGN))
{
// Bad command
cur_op->reply.hdr.magic = SECONDARY_OSD_REPLY_MAGIC;
cur_op->reply.hdr.id = cur_op->req.hdr.id;
cur_op->reply.hdr.opcode = cur_op->req.hdr.opcode;
cur_op->reply.hdr.retval = -EINVAL;
if (cur_op->peer_fd)
{
outbox_push(this->clients[cur_op->peer_fd], cur_op);
}
else
{
cur_op->callback(cur_op);
}
finish_op(cur_op, -EINVAL);
return;
}
inflight_ops++;

3
osd.h
View File

@ -44,6 +44,8 @@
//#define OSD_STUB
extern const char* osd_op_names[];
struct osd_op_buf_list_t
{
int count = 0, alloc = 0, sent = 0;
@ -194,6 +196,7 @@ class osd_t
std::map<uint64_t, int> osd_peer_fds;
std::map<pg_num_t, pg_t> pgs;
uint64_t misplaced_objects = 0, degraded_objects = 0, incomplete_objects = 0;
int peering_state = 0;
unsigned pg_count = 0;
uint64_t next_subop_id = 1;

View File

@ -254,6 +254,7 @@ resume_4:
auto st_it = pg->degraded_objects.find(recovery_state.oid);
st = st_it->second;
pg->degraded_objects.erase(st_it);
degraded_objects--;
}
st->object_count--;
if (st->state == OBJ_DEGRADED)

View File

@ -176,6 +176,9 @@ void osd_t::handle_peers()
if (!p.second.peering_state->list_ops.size())
{
p.second.calc_object_states();
incomplete_objects += p.second.incomplete_objects.size();
misplaced_objects += p.second.misplaced_objects.size();
degraded_objects += p.second.degraded_objects.size();
if (p.second.state & PG_HAS_UNCLEAN)
peering_state = peering_state | OSD_FLUSHING_PGS;
else
@ -256,6 +259,9 @@ void osd_t::start_pg_peering(pg_num_t pg_num)
pg.state = PG_PEERING;
pg.print_state();
pg.state_dict.clear();
incomplete_objects -= pg.incomplete_objects.size();
misplaced_objects -= pg.misplaced_objects.size();
degraded_objects -= pg.degraded_objects.size();
pg.incomplete_objects.clear();
pg.misplaced_objects.clear();
pg.degraded_objects.clear();

View File

@ -204,6 +204,14 @@ void pg_obj_state_check_t::finish_object()
{
printf("Present on: osd %lu, role %ld%s\n", list[i].osd_num, (list[i].oid.stripe & STRIPE_MASK), list[i].is_stable ? " (stable)" : "");
}
if (0)
{
// For future debug level
for (int i = obj_start; i < obj_end; i++)
{
printf("v%lu present on: osd %lu, role %ld%s\n", list[i].version, list[i].osd_num, (list[i].oid.stripe & STRIPE_MASK), list[i].is_stable ? " (stable)" : "");
}
}
state = OBJ_INCOMPLETE;
pg->state = pg->state | PG_HAS_INCOMPLETE;
}

View File

@ -42,7 +42,8 @@ void osd_t::finish_op(osd_op_t *cur_op, int retval)
{
if (!cur_op->peer_fd)
{
cur_op->callback(cur_op);
// Copy lambda to be unaffected by `delete op`
std::function<void(osd_op_t*)>(cur_op->callback)(cur_op);
}
else
{
@ -254,7 +255,7 @@ void osd_t::submit_primary_subops(int submit_type, int pg_size, const uint64_t*
if (subop->opcode == BS_OP_WRITE && subop->retval != subop->len)
{
// die
throw std::runtime_error("local write operation failed");
throw std::runtime_error("local write operation failed (retval = "+std::to_string(subop->retval)+")");
}
handle_primary_subop(
subop->opcode == BS_OP_WRITE ? OSD_OP_SECONDARY_WRITE : OSD_OP_SECONDARY_READ,
@ -298,17 +299,19 @@ void osd_t::submit_primary_subops(int submit_type, int pg_size, const uint64_t*
}
subops[subop].callback = [cur_op, this](osd_op_t *subop)
{
int fail_fd = subop->req.hdr.opcode == OSD_OP_SECONDARY_WRITE &&
subop->reply.hdr.retval != subop->req.sec_rw.len ? subop->peer_fd : -1;
// so it doesn't get freed
subop->buf = NULL;
if (subop->req.hdr.opcode == OSD_OP_SECONDARY_WRITE && cur_op->reply.hdr.retval != cur_op->req.sec_rw.len)
{
// write operation failed, drop the connection
stop_client(subop->peer_fd);
}
handle_primary_subop(
subop->req.hdr.opcode, cur_op, subop->reply.hdr.retval,
subop->req.sec_rw.len, subop->reply.sec_rw.version
);
if (fail_fd >= 0)
{
// write operation failed, drop the connection
stop_client(fail_fd);
}
};
outbox_push(clients[subops[subop].peer_fd], &subops[subop]);
}
@ -322,8 +325,11 @@ void osd_t::handle_primary_subop(uint64_t opcode, osd_op_t *cur_op, int retval,
osd_primary_op_data_t *op_data = cur_op->op_data;
if (retval != expected)
{
printf("%s subop failed: retval = %d (expected %d)\n", osd_op_names[opcode], retval, expected);
if (retval == -EPIPE)
{
op_data->epipe++;
}
op_data->errors++;
}
else
@ -565,7 +571,7 @@ resume_6:
op_data->st = 6;
return;
resume_7:
// FIXME: Free them correctly (via a destructor or so)
// FIXME: Free those in the destructor?
delete op_data->unstable_write_osds;
delete[] op_data->unstable_writes;
op_data->unstable_writes = NULL;
@ -796,12 +802,13 @@ void osd_t::submit_primary_sync_subops(osd_op_t *cur_op)
};
subops[i].callback = [cur_op, this](osd_op_t *subop)
{
if (cur_op->reply.hdr.retval != 0)
int fail_fd = subop->reply.hdr.retval != 0 ? subop->peer_fd : 0;
handle_primary_subop(OSD_OP_SECONDARY_SYNC, cur_op, subop->reply.hdr.retval, 0, 0);
if (fail_fd >= 0)
{
// sync operation failed, drop the connection
stop_client(subop->peer_fd);
stop_client(fail_fd);
}
handle_primary_subop(OSD_OP_SECONDARY_SYNC, cur_op, subop->reply.hdr.retval, 0, 0);
};
outbox_push(clients[subops[i].peer_fd], &subops[i]);
}
@ -853,12 +860,13 @@ void osd_t::submit_primary_stab_subops(osd_op_t *cur_op)
subops[i].send_list.push_back(op_data->unstable_writes + stab_osd.start, stab_osd.len * sizeof(obj_ver_id));
subops[i].callback = [cur_op, this](osd_op_t *subop)
{
if (cur_op->reply.hdr.retval != 0)
int fail_fd = subop->reply.hdr.retval != 0 ? subop->peer_fd : 0;
handle_primary_subop(OSD_OP_SECONDARY_STABILIZE, cur_op, subop->reply.hdr.retval, 0, 0);
if (fail_fd >= 0)
{
// sync operation failed, drop the connection
stop_client(subop->peer_fd);
stop_client(fail_fd);
}
handle_primary_subop(OSD_OP_SECONDARY_STABILIZE, cur_op, subop->reply.hdr.retval, 0, 0);
};
outbox_push(clients[subops[i].peer_fd], &subops[i]);
}

View File

@ -247,6 +247,7 @@ void osd_t::handle_reply_hdr(osd_client_t *cl)
(tv_end.tv_sec - op->tv_begin.tv_sec)*1000000 +
(tv_end.tv_nsec - op->tv_begin.tv_nsec)/1000
);
op->callback(op);
// Copy lambda to be unaffected by `delete op`
std::function<void(osd_op_t*)>(op->callback)(op);
}
}