Test & fix single-PG primary OSD

- Add support for benchmarking single primary OSD in fio_sec_osd
- Do not wait for the next event in flushers (bring resume_0 back)
- Fix flushing of zero-length writes
- Print PG object count when peering
- Print journal free space when starting and when congested
Branch: blocking-uring-test
Author: Vitaliy Filippov, 2020-02-26 18:32:00 +03:00
Parent: df66a76ce2
Commit: 1733de2db6
8 changed files with 133 additions and 63 deletions

View File

@@ -154,6 +154,7 @@ bool journal_flusher_co::loop()
         goto resume_17;
     else if (wait_state == 18)
         goto resume_18;
+resume_0:
     if (!flusher->flush_queue.size() ||
         !flusher->start_forced && !flusher->active_flushers && flusher->flush_queue.size() < flusher->sync_threshold)
     {
@@ -181,7 +182,7 @@ bool journal_flusher_co::loop()
             if (repeat_it->second < cur.version)
                 repeat_it->second = cur.version;
             wait_state = 0;
-            return true;
+            goto resume_0;
         }
         else
             flusher->sync_to_repeat[cur.oid] = 0;
@@ -196,7 +197,7 @@ resume_1:
         wait_state += 1;
         return false;
     }
-    if (copy_count == 0 && clean_loc == UINT64_MAX && !has_delete)
+    if (copy_count == 0 && clean_loc == UINT64_MAX && !has_delete && !has_empty)
     {
         // Nothing to flush
         flusher->active_flushers--;
@@ -208,7 +209,7 @@ resume_1:
        }
        flusher->sync_to_repeat.erase(repeat_it);
        wait_state = 0;
-       return true;
+       goto resume_0;
    }
    // Find it in clean_db
    clean_it = bs->clean_db.find(cur.oid);
@@ -427,7 +428,7 @@ resume_1:
        }
        flusher->sync_to_repeat.erase(repeat_it);
        wait_state = 0;
-       return true;
+       goto resume_0;
    }
    return true;
}
@@ -444,6 +445,7 @@ bool journal_flusher_co::scan_dirty(int wait_base)
    copy_count = 0;
    clean_loc = UINT64_MAX;
    has_delete = false;
+   has_empty = false;
    skip_copy = false;
    clean_init_bitmap = false;
    while (1)
@@ -451,40 +453,47 @@ bool journal_flusher_co::scan_dirty(int wait_base)
        if (dirty_it->second.state == ST_J_STABLE && !skip_copy)
        {
            // First we submit all reads
-           offset = dirty_it->second.offset;
-           end_offset = dirty_it->second.offset + dirty_it->second.len;
-           it = v.begin();
-           while (1)
+           if (dirty_it->second.len == 0)
+           {
+               has_empty = true;
+           }
+           else
            {
-               for (; it != v.end(); it++)
-                   if (it->offset >= offset)
-                       break;
-               if (it == v.end() || it->offset > offset && it->len > 0)
+               offset = dirty_it->second.offset;
+               end_offset = dirty_it->second.offset + dirty_it->second.len;
+               it = v.begin();
+               while (1)
                {
-                   submit_offset = dirty_it->second.location + offset - dirty_it->second.offset;
-                   submit_len = it == v.end() || it->offset >= end_offset ? end_offset-offset : it->offset-offset;
-                   it = v.insert(it, (copy_buffer_t){ .offset = offset, .len = submit_len, .buf = memalign(MEM_ALIGNMENT, submit_len) });
-                   copy_count++;
-                   if (bs->journal.inmemory)
+                   for (; it != v.end(); it++)
+                       if (it->offset >= offset)
+                           break;
+                   if (it == v.end() || it->offset > offset && it->len > 0)
                    {
-                       // Take it from memory
-                       memcpy(v.back().buf, bs->journal.buffer + submit_offset, submit_len);
-                   }
-                   else
-                   {
-                       // Read it from disk
-                       await_sqe(0);
-                       data->iov = (struct iovec){ v.back().buf, (size_t)submit_len };
-                       data->callback = simple_callback_r;
-                       my_uring_prep_readv(
-                           sqe, bs->journal.fd, &data->iov, 1, bs->journal.offset + submit_offset
-                       );
-                       wait_count++;
+                       submit_offset = dirty_it->second.location + offset - dirty_it->second.offset;
+                       submit_len = it == v.end() || it->offset >= end_offset ? end_offset-offset : it->offset-offset;
+                       it = v.insert(it, (copy_buffer_t){ .offset = offset, .len = submit_len, .buf = memalign(MEM_ALIGNMENT, submit_len) });
+                       copy_count++;
+                       if (bs->journal.inmemory)
+                       {
+                           // Take it from memory
+                           memcpy(v.back().buf, bs->journal.buffer + submit_offset, submit_len);
+                       }
+                       else
+                       {
+                           // Read it from disk
+                           await_sqe(0);
+                           data->iov = (struct iovec){ v.back().buf, (size_t)submit_len };
+                           data->callback = simple_callback_r;
+                           my_uring_prep_readv(
+                               sqe, bs->journal.fd, &data->iov, 1, bs->journal.offset + submit_offset
+                           );
+                           wait_count++;
+                       }
                    }
+                   offset = it->offset+it->len;
+                   if (it == v.end() || offset >= end_offset)
+                       break;
                }
-               offset = it->offset+it->len;
-               if (it == v.end() || offset >= end_offset)
-                   break;
            }
        }
        else if (dirty_it->second.state == ST_D_STABLE && !skip_copy)
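Both changes in this file follow from the flusher's hand-rolled coroutine convention: wait_state records where the coroutine went to sleep, the goto resume_N ladder at the top of loop() jumps back to that point on the next call, return false yields while waiting for I/O, and return true yields after finishing an object. Replacing the trailing return true with goto resume_0 makes a flusher that has just finished an object (or found nothing to do for it) immediately pick the next item off flush_queue instead of idling until another journal event wakes it. The has_empty flag appears to serve the same goal for zero-length writes: they contribute no copy ranges, so without it the "Nothing to flush" shortcut would skip them and their journal entries would never be flushed. A minimal sketch of the resume pattern, with invented names (mini_flusher, step), not the actual class:

#include <cstdio>

// Minimal sketch of a wait_state-based resumable function. The real caller
// re-invokes step() from the event loop whenever a completion arrives.
struct mini_flusher
{
    int wait_state = 0;
    int queue_size = 3;
    bool step()
    {
        if (wait_state == 1)
            goto resume_1;
    resume_0:
        if (queue_size == 0)
            return true;        // really nothing to do: go to sleep
        printf("submitting flush, %d left\n", queue_size);
        wait_state = 1;         // remember where to resume
        return false;           // yield until the I/O completes
    resume_1:
        queue_size--;
        wait_state = 0;
        goto resume_0;          // take the next item right away instead of
                                // returning and waiting for another event
    }
};

int main()
{
    mini_flusher f;
    while (!f.step()) {}        // stands in for the io_uring event loop
}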

View File

@@ -45,7 +45,7 @@ class journal_flusher_co
     std::map<object_id, uint64_t>::iterator repeat_it;
     std::function<void(ring_data_t*)> simple_callback_r, simple_callback_w;
-    bool skip_copy, has_delete;
+    bool skip_copy, has_delete, has_empty;
     spp::sparse_hash_map<object_id, clean_entry>::iterator clean_it;
     std::vector<copy_buffer_t> v;
     std::vector<copy_buffer_t>::iterator it;

View File

@@ -402,7 +402,15 @@ resume_1:
     }
     // Trim journal on start so we don't stall when all entries are older
     bs->journal.trim();
-    printf("Journal entries loaded: %lu, free blocks: %lu / %lu\n", entries_loaded, bs->data_alloc->get_free_count(), bs->block_count);
+    printf(
+        "Journal entries loaded: %lu, free journal space: %lu bytes (%lu..%lu is used), free blocks: %lu / %lu\n",
+        entries_loaded,
+        (bs->journal.next_free >= bs->journal.used_start
+            ? bs->journal.len-bs->journal.block_size - (bs->journal.next_free-bs->journal.used_start)
+            : bs->journal.used_start - bs->journal.next_free),
+        bs->journal.used_start, bs->journal.next_free,
+        bs->data_alloc->get_free_count(), bs->block_count
+    );
     bs->journal.crc32_last = crc32_last;
     return 0;
 }
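The ternary inside this printf is the ring-buffer free-space calculation: the journal occupies len bytes, with the live region running from used_start to next_free. A hedged restatement as a standalone helper (hypothetical function, not part of the patch):

#include <cstdint>

// Free space in a circular journal of `len` bytes where the live region
// runs from used_start (inclusive) to next_free (exclusive). One block_size
// is subtracted in the non-wrapped case, apparently so that next_free can
// never catch up to used_start and make a full journal look empty.
static uint64_t journal_free_space(uint64_t len, uint64_t block_size,
    uint64_t used_start, uint64_t next_free)
{
    if (next_free >= used_start)
        // live data doesn't wrap: free = everything outside [used_start, next_free)
        return len - block_size - (next_free - used_start);
    else
        // live data wraps past the end: free = the gap between the pointers
        return used_start - next_free;
}

The same expression is duplicated verbatim in the congestion message in the next file; a helper like this would keep the two copies in sync.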

View File

@@ -67,6 +67,12 @@ int blockstore_journal_check_t::check_available(blockstore_op_t *op, int require
     if (!right_dir && next_pos >= bs->journal.used_start-bs->journal.block_size)
     {
         // No space in the journal. Wait until used_start changes.
+        printf(
+            "Ran out of journal space (free space: %lu bytes)\n",
+            (bs->journal.next_free >= bs->journal.used_start
+                ? bs->journal.len-bs->journal.block_size - (bs->journal.next_free-bs->journal.used_start)
+                : bs->journal.used_start - bs->journal.next_free)
+        );
         PRIV(op)->wait_for = WAIT_JOURNAL;
         bs->flusher->force_start();
         PRIV(op)->wait_detail = bs->journal.used_start;
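This ties back to the flusher changes in the first file: bs->flusher->force_start() presumably sets the start_forced flag checked at resume_0, so a congested journal forces flushing even while flush_queue is below sync_threshold. The new message makes the condition visible in the log, e.g. (values illustrative):

Ran out of journal space (free space: 126976 bytes)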

View File

@@ -5,7 +5,7 @@
 // Random write:
 //
 // fio -thread -ioengine=./libfio_sec_osd.so -name=test -bs=4k -direct=1 -fsync=16 -iodepth=16 -rw=randwrite \
-//     -host=127.0.0.1 -port=11203 -size=1000M
+//     -host=127.0.0.1 -port=11203 [-single_primary=1] -size=1000M
 //
 // Linear write:
 //
@@ -50,6 +50,7 @@ struct sec_options
     int __pad;
     char *host = NULL;
     int port = 0;
+    bool single_primary = false;
 };
 
 static struct fio_option options[] = {
@@ -71,6 +72,16 @@ static struct fio_option options[] = {
         .category = FIO_OPT_C_ENGINE,
         .group = FIO_OPT_G_FILENAME,
     },
+    {
+        .name = "single_primary",
+        .lname = "Single Primary",
+        .type = FIO_OPT_BOOL,
+        .off1 = offsetof(struct sec_options, single_primary),
+        .help = "Test single Primary OSD (one PG) instead of Secondary",
+        .def = "0",
+        .category = FIO_OPT_C_ENGINE,
+        .group = FIO_OPT_G_FILENAME,
+    },
     {
         .name = NULL,
     },
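With the FIO_OPT_BOOL option registered, a single-primary benchmark run matches the updated header comment, e.g.:

fio -thread -ioengine=./libfio_sec_osd.so -name=test -bs=4k -direct=1 -fsync=16 \
    -iodepth=16 -rw=randwrite -host=127.0.0.1 -port=11203 -single_primary=1 -size=1000M

Omitting -single_primary (or passing -single_primary=0, the default) keeps the old secondary-OSD behaviour.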
@@ -150,6 +161,7 @@ static int sec_init(struct thread_data *td)
 /* Begin read or write request. */
 static enum fio_q_status sec_queue(struct thread_data *td, struct io_u *io)
 {
+    sec_options *opt = (sec_options*)td->eo;
     sec_data *bsd = (sec_data*)td->io_ops_data;
     int n = bsd->op_n;
@@ -167,31 +179,59 @@ static enum fio_q_status sec_queue(struct thread_data *td, struct io_u *io)
     switch (io->ddir)
     {
     case DDIR_READ:
-        op.hdr.opcode = OSD_OP_SECONDARY_READ;
-        op.sec_rw.oid = {
-            .inode = 1,
-            .stripe = io->offset >> bsd->block_order,
-        };
-        op.sec_rw.version = UINT64_MAX; // last unstable
-        op.sec_rw.offset = io->offset % bsd->block_size;
-        op.sec_rw.len = io->xfer_buflen;
+        if (!opt->single_primary)
+        {
+            op.hdr.opcode = OSD_OP_SECONDARY_READ;
+            op.sec_rw.oid = {
+                .inode = 1,
+                .stripe = io->offset >> bsd->block_order,
+            };
+            op.sec_rw.version = UINT64_MAX; // last unstable
+            op.sec_rw.offset = io->offset % bsd->block_size;
+            op.sec_rw.len = io->xfer_buflen;
+        }
+        else
+        {
+            op.hdr.opcode = OSD_OP_READ;
+            op.rw.inode = 1;
+            op.rw.offset = io->offset;
+            op.rw.len = io->xfer_buflen;
+        }
         bsd->last_sync = false;
         break;
     case DDIR_WRITE:
-        op.hdr.opcode = OSD_OP_SECONDARY_WRITE;
-        op.sec_rw.oid = {
-            .inode = 1,
-            .stripe = io->offset >> bsd->block_order,
-        };
-        op.sec_rw.version = 0; // assign automatically
-        op.sec_rw.offset = io->offset % bsd->block_size;
-        op.sec_rw.len = io->xfer_buflen;
+        if (!opt->single_primary)
+        {
+            op.hdr.opcode = OSD_OP_SECONDARY_WRITE;
+            op.sec_rw.oid = {
+                .inode = 1,
+                .stripe = io->offset >> bsd->block_order,
+            };
+            op.sec_rw.version = 0; // assign automatically
+            op.sec_rw.offset = io->offset % bsd->block_size;
+            op.sec_rw.len = io->xfer_buflen;
+        }
+        else
+        {
+            op.hdr.opcode = OSD_OP_WRITE;
+            op.rw.inode = 1;
+            op.rw.offset = io->offset;
+            op.rw.len = io->xfer_buflen;
+        }
         bsd->last_sync = false;
         break;
     case DDIR_SYNC:
-        // Allowed only for testing: sync & stabilize all unstable object versions
-        op.hdr.opcode = OSD_OP_TEST_SYNC_STAB_ALL;
-        // fio sends 32 syncs with -fsync=32. we omit 31 of them even though it's not 100% fine (FIXME: fix fio itself)
+        if (!opt->single_primary)
+        {
+            // Allowed only for testing: sync & stabilize all unstable object versions
+            op.hdr.opcode = OSD_OP_TEST_SYNC_STAB_ALL;
+        }
+        else
+        {
+            op.hdr.opcode = OSD_OP_SYNC;
+        }
+        // fio sends 32 syncs with -fsync=32. we omit 31 of them even though
+        // generally it may not be 100% correct (FIXME: fix fio itself)
        bsd->last_sync = true;
        break;
    default:
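The split above is mostly address translation: a secondary op targets one object, so the engine must decompose fio's linear offset itself, while a primary op passes the linear offset through and lets the OSD locate objects. Likewise, the secondary path needs the test-only OSD_OP_TEST_SYNC_STAB_ALL (sync plus stabilize all unstable versions), whereas the primary OSD exposes a regular OSD_OP_SYNC. A sketch of the offset decomposition, assuming block_size == (1 << block_order) as the engine's fields suggest:

#include <cstdint>

// How a linear benchmark offset maps onto a secondary-OSD object address.
struct sec_addr { uint64_t stripe, offset; };

static sec_addr to_secondary(uint64_t io_offset, int block_order)
{
    uint64_t block_size = 1ull << block_order;
    return (sec_addr){
        .stripe = io_offset >> block_order,  // which object of inode 1
        .offset = io_offset % block_size,    // byte offset inside that object
    };
}
// A primary op (OSD_OP_READ/OSD_OP_WRITE) instead carries io_offset
// unchanged in op.rw.offset; the OSD does the PG/object math internally.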

View File

@@ -133,6 +133,7 @@ void pg_t::remember_object(pg_obj_state_check_t &st, std::vector<obj_ver_role> &
     }
     else
         pg.clean_count++;
+    pg.total_count++;
 }
 
 // FIXME: Write at least some tests for this function
@@ -167,6 +168,7 @@ void pg_t::calc_object_states()
     std::sort(all.begin(), all.end());
     // Walk over it and check object states
     pg.clean_count = 0;
+    pg.total_count = 0;
     pg.state = 0;
     int replica = 0;
     pg_obj_state_check_t st;
@@ -251,12 +253,13 @@ void pg_t::calc_object_states()
         pg.state = pg.state | PG_DEGRADED;
     }
     printf(
-        "PG %u is active%s%s%s%s%s\n", pg.pg_num,
+        "PG %u is active%s%s%s%s%s (%lu objects)\n", pg.pg_num,
         (pg.state & PG_DEGRADED) ? " + degraded" : "",
         (pg.state & PG_HAS_UNFOUND) ? " + has_unfound" : "",
         (pg.state & PG_HAS_DEGRADED) ? " + has_degraded" : "",
         (pg.state & PG_HAS_MISPLACED) ? " + has_misplaced" : "",
-        (pg.state & PG_HAS_UNCLEAN) ? " + has_unclean" : ""
+        (pg.state & PG_HAS_UNCLEAN) ? " + has_unclean" : "",
+        pg.total_count
     );
     pg.state = pg.state | PG_ACTIVE;
 }
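After peering, the status line now reports the total object count counted during calc_object_states() alongside the state flags, e.g. (values illustrative):

PG 1 is active + has_degraded (131072 objects)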

View File

@@ -111,7 +111,7 @@ struct pg_t
     int state;
     uint64_t pg_cursize = 3, pg_size = 3, pg_minsize = 2;
     pg_num_t pg_num;
-    uint64_t clean_count = 0;
+    uint64_t clean_count = 0, total_count = 0;
     // target_set is the "correct" peer OSD set for this PG
     std::vector<osd_num_t> target_set;
     // cur_set is the current set of connected peer OSDs for this PG

View File

@@ -328,17 +328,18 @@ void osd_t::continue_primary_write(osd_op_t *cur_op)
             return;
         }
     }
-resume_1:
     // Check if there are other write requests to the same object
     {
-        auto vo_it = pg.ver_override.find(op_data->oid);
-        if (vo_it != pg.ver_override.end())
+        auto vo_it = pg.write_queue.find(op_data->oid);
+        if (vo_it != pg.write_queue.end())
         {
             op_data->st = 1;
             pg.write_queue.emplace(op_data->oid, cur_op);
             return;
         }
+        pg.write_queue.emplace(op_data->oid, cur_op);
     }
+resume_1:
     // Determine blocks to read
     cur_op->rmw_buf = calc_rmw_reads(cur_op->buf, op_data->stripes, pg.cur_set.data(), pg.pg_size, pg.pg_minsize, pg.pg_cursize);
     // Read required blocks
@@ -381,10 +382,13 @@ resume_5:
     // Continue other write operations to the same object
     {
         auto next_it = pg.write_queue.find(op_data->oid);
-        if (next_it != pg.write_queue.end())
+        auto this_it = next_it;
+        next_it++;
+        pg.write_queue.erase(this_it);
+        if (next_it != pg.write_queue.end() &&
+            next_it->first == op_data->oid)
         {
             osd_op_t *next_op = next_it->second;
-            pg.write_queue.erase(next_it);
             continue_primary_write(next_op);
         }
     }
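These two hunks fix per-object write serialization. The old code consulted ver_override (the wrong map) to detect concurrent writes and never enqueued the op that was actually running, so a completing write could not reliably find and wake its successors. The new code keys everything on write_queue and enqueues the running op as well. A self-contained sketch of the scheme, assuming write_queue is a std::multimap<object_id, osd_op_t*> (consistent with the duplicate-key emplace and the next_it->first == op_data->oid check; names simplified):

#include <cstdint>
#include <iterator>
#include <map>

struct object_id
{
    uint64_t inode, stripe;
    bool operator<(const object_id & b) const
    {
        return inode < b.inode || (inode == b.inode && stripe < b.stripe);
    }
    bool operator==(const object_id & b) const
    {
        return inode == b.inode && stripe == b.stripe;
    }
};

struct osd_op_t;

static std::multimap<object_id, osd_op_t*> write_queue;

// Submission path: the first writer to an object proceeds immediately,
// later writers enqueue themselves and park (op_data->st = 1 in the diff).
static bool try_start_write(const object_id & oid, osd_op_t *op)
{
    bool busy = write_queue.find(oid) != write_queue.end();
    write_queue.emplace(oid, op); // the running op is queued too
    return !busy;
}

// Completion path: erase our own entry, then wake the next writer queued
// for the same object, if any (the this_it/next_it dance after resume_5).
static osd_op_t *finish_write(const object_id & oid)
{
    auto this_it = write_queue.find(oid);
    auto next_it = std::next(this_it);
    write_queue.erase(this_it);
    if (next_it != write_queue.end() && next_it->first == oid)
        return next_it->second; // caller re-enters continue_primary_write
    return nullptr;
}

Entries with equal keys keep insertion order in a multimap, so writers to one object resume in FIFO order while writes to different objects stay independent.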