Continue sync implementation

blocking-uring-test
Vitaliy Filippov 2019-11-09 02:16:44 +03:00
parent 7456f0f7e2
commit 8e634d5b74
3 changed files with 122 additions and 32 deletions

@@ -93,15 +93,13 @@ void blockstore::handle_event(ring_data_t *data)
             // FIXME: our state becomes corrupted after a write error. maybe do something better than just die
             throw new std::runtime_error("write operation failed. in-memory state is corrupted. AAAAAAAaaaaaaaaa!!!111");
         }
-        if (op->used_journal_sector > 0)
+        if (op->min_used_journal_sector > 0)
         {
-            uint64_t s = op->used_journal_sector-1;
-            if (journal.sector_info[s].usage_count > 0)
+            for (uint64_t s = op->min_used_journal_sector; s <= op->max_used_journal_sector; s++)
             {
-                // The last write to this journal sector was made by this op, release the buffer
-                journal.sector_info[s].usage_count--;
+                journal.sector_info[s-1].usage_count--;
             }
-            op->used_journal_sector = 0;
+            op->min_used_journal_sector = op->max_used_journal_sector = 0;
         }
         if (op->pending_ops == 0)
         {
@@ -123,6 +121,14 @@ void blockstore::handle_event(ring_data_t *data)
     }
     else if ((op->flags & OP_TYPE_MASK) == OP_SYNC)
     {
+        if (op->min_used_journal_sector > 0)
+        {
+            for (uint64_t s = op->min_used_journal_sector; s <= op->max_used_journal_sector; s++)
+            {
+                journal.sector_info[s-1].usage_count--;
+            }
+            op->min_used_journal_sector = op->max_used_journal_sector = 0;
+        }
     }
     else if ((op->flags & OP_TYPE_MASK) == OP_STABLE)
     {
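
For context on what these two hunks do together, below is a minimal, self-contained model of the new accounting (compilable on its own; the structs are simplified stand-ins for the real blockstore types, and the 1-based numbering is the convention implied by the code: 0 means "no journal sector referenced", so sector index s is stored as s+1 and released via sector_info[s-1]).

#include <cstdint>
#include <vector>
#include <cassert>

struct sector_info_t { uint64_t offset = 0; int usage_count = 0; };

struct op_t
{
    // 1-based journal sector numbers; 0 means "none referenced"
    uint64_t min_used_journal_sector = 0, max_used_journal_sector = 0;
};

int main()
{
    std::vector<sector_info_t> sector_info(8);
    op_t op;
    // Submission side: the op fills entries into sectors 2..4 (0-based 1..3)
    // and pins every sector buffer it touches.
    for (uint64_t s = 2; s <= 4; s++)
        sector_info[s-1].usage_count++;
    op.min_used_journal_sector = 2;
    op.max_used_journal_sector = 4;
    // Completion side: release the whole range, as handle_event() now does.
    if (op.min_used_journal_sector > 0)
    {
        for (uint64_t s = op.min_used_journal_sector; s <= op.max_used_journal_sector; s++)
            sector_info[s-1].usage_count--;
        op.min_used_journal_sector = op.max_used_journal_sector = 0;
    }
    for (auto & si : sector_info)
        assert(si.usage_count == 0);
    return 0;
}

The range form replaces the old single used_journal_sector field because a sync can now write big-write entries into several consecutive journal sectors in one operation.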

@@ -69,11 +69,20 @@
     struct io_uring_sqe *sqe = get_sqe();\
     if (!sqe)\
     {\
-        // Pause until there are more requests available\
+        /* Pause until there are more requests available */\
         op->wait_for = WAIT_SQE;\
         return 0;\
     }\
-    struct ring_data_t *data = ((ring_data_t*)sqe->user_data);
+    struct ring_data_t *data = ((ring_data_t*)sqe->user_data)
+
+#define BS_SUBMIT_GET_SQE_DECL(sqe) \
+    sqe = get_sqe();\
+    if (!sqe)\
+    {\
+        /* Pause until there are more requests available */\
+        op->wait_for = WAIT_SQE;\
+        return 0;\
+    }
 
 // 16 bytes per object/stripe id
 // stripe includes replica number in 4 least significant bits
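
A rough illustration of why the second macro was added (self-contained sketch; fake_sqe, free_sqes and submit_three() are stand-ins invented for this example, not repo code): BS_SUBMIT_GET_SQE declares a fresh sqe/data pair, so it cannot fill an array, while BS_SUBMIT_GET_SQE_DECL only assigns an already-declared variable and keeps the same "park the op and return 0" behaviour when no SQE is free.

#include <cstdio>

#define WAIT_SQE 1

struct fake_sqe { int id; };
static fake_sqe pool[2];
static int free_sqes = 2;  // pretend only two submission slots are free
static int wait_for = 0;

static fake_sqe *get_sqe()
{
    return free_sqes > 0 ? &pool[--free_sqes] : nullptr;
}

// Same shape as BS_SUBMIT_GET_SQE_DECL above: assign into an existing variable,
// or record what the op is waiting for and return 0 so it is retried later.
#define SUBMIT_GET_SQE_DECL(sqe) \
    sqe = get_sqe();\
    if (!sqe)\
    {\
        wait_for = WAIT_SQE;\
        return 0;\
    }

static int submit_three()
{
    fake_sqe *sqe[3];
    for (int i = 0; i < 3; i++)
    {
        SUBMIT_GET_SQE_DECL(sqe[i]);
    }
    return 1;
}

int main()
{
    if (!submit_three())
        printf("parked with wait_for=%d: only 2 of 3 SQEs were available\n", wait_for);
    return 0;
}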
@@ -202,9 +211,10 @@ struct blockstore_operation
     // FIXME make all of these pointers and put them into a union
     std::map<uint64_t, struct iovec> read_vec;
-    uint64_t used_journal_sector;
+    uint64_t min_used_journal_sector, max_used_journal_sector;
     std::deque<obj_ver_id> sync_writes;
-    bool has_big_writes;
+    int big_write_count;
+    int big_write_state;
 };
 
 class blockstore;

@@ -18,7 +18,7 @@ int blockstore::dequeue_write(blockstore_operation *op)
             op->callback(op);
             return 1;
         }
-        BS_GET_SQE(sqe, data);
+        BS_SUBMIT_GET_SQE(sqe, data);
         dirty_it->second.location = loc << block_order;
         dirty_it->second.state = ST_D_SUBMITTED;
         allocator_set(data_alloc, loc, true);
@@ -28,7 +28,7 @@ int blockstore::dequeue_write(blockstore_operation *op)
             sqe, data_fd, &data->iov, 1, data_offset + (loc << block_order)
         );
         op->pending_ops = 1;
-        op->used_journal_sector = 0;
+        op->min_used_journal_sector = op->max_used_journal_sector = 0;
     }
     else
     {
@@ -38,11 +38,9 @@ int blockstore::dequeue_write(blockstore_operation *op)
         uint64_t next_pos = journal.next_free;
         if (512 - journal.in_sector_pos < sizeof(struct journal_entry_small_write))
         {
-            next_pos = next_pos + 512;
             //if (journal.len - next_pos < op->len)
             //    two_sqes = true;
-            if (next_pos >= journal.len)
-                next_pos = 512;
+            next_pos = (next_pos+512) < journal.len ? next_pos+512 : 512;
             // Also check if we have an unused memory buffer for the journal sector
             if (journal.sector_info[((journal.cur_sector + 1) % journal.sector_count)].usage_count > 0)
             {
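
The ternary introduced here (and reused in the hunks below) folds the old two-step "advance, then wrap" logic into one expression. A tiny self-contained illustration of the resulting ring behaviour, under the assumption that offset 0 holds the journal header and the usable area therefore starts at byte 512:

#include <cstdint>
#include <cstdio>

// Wrap rule as written in the diff: advance by one 512-byte sector,
// and wrap back to 512 (not 0) when the end of the journal is reached.
static uint64_t next_journal_pos(uint64_t pos, uint64_t journal_len)
{
    return (pos + 512) < journal_len ? pos + 512 : 512;
}

int main()
{
    uint64_t len = 4096, pos = 512;
    for (int i = 0; i < 8; i++)
    {
        printf("%llu ", (unsigned long long)pos);
        pos = next_journal_pos(pos, len);
    }
    printf("\n"); // 512 1024 1536 2048 2560 3072 3584 512
    return 0;
}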
@@ -64,8 +62,8 @@ int blockstore::dequeue_write(blockstore_operation *op)
         }
         // There is sufficient space. Get SQE(s)
         unsigned prev_sqe_pos = ringloop->ring->sq.sqe_tail;
-        BS_GET_SQE(sqe1, data1);
-        BS_GET_SQE(sqe2, data2);
+        BS_SUBMIT_GET_SQE(sqe1, data1);
+        BS_SUBMIT_GET_SQE(sqe2, data2);
         // Got SQEs. Prepare journal sector write
         if (512 - journal.in_sector_pos < sizeof(struct journal_entry_small_write))
         {
@@ -74,7 +72,7 @@ int blockstore::dequeue_write(blockstore_operation *op)
             journal.cur_sector = ((journal.cur_sector + 1) % journal.sector_count);
             journal.sector_info[journal.cur_sector].offset = journal.next_free;
             journal.in_sector_pos = 0;
-            journal.next_free = (journal.next_free + 512) >= journal.len ? journal.next_free + 512 : 512;
+            journal.next_free = (journal.next_free+512) < journal.len ? journal.next_free + 512 : 512;
             memset(journal.sector_buf + 512*journal.cur_sector, 0, 512);
         }
         journal_entry_small_write *je = (struct journal_entry_small_write*)(
@@ -92,14 +90,14 @@ int blockstore::dequeue_write(blockstore_operation *op)
             .len = op->len,
         };
         je->crc32 = je_crc32((journal_entry*)je);
+        journal.crc32_last = je->crc32;
         data1->iov = (struct iovec){ journal.sector_buf + 512*journal.cur_sector, 512 };
         data1->op = op;
         io_uring_prep_writev(
             sqe1, journal.fd, &data1->iov, 1, journal.offset + journal.sector_info[journal.cur_sector].offset
         );
         // Prepare journal data write
-        if (journal.len - journal.next_free < op->len)
-            journal.next_free = 512;
+        journal.next_free = (journal.next_free + op->len) < journal.len ? journal.next_free + op->len : 512;
         data2->iov = (struct iovec){ op->buf, op->len };
         data2->op = op;
         io_uring_prep_writev(
@@ -109,32 +107,31 @@ int blockstore::dequeue_write(blockstore_operation *op)
         dirty_it->second.state = ST_J_SUBMITTED;
         // Move journal.next_free and save last write for current sector
         journal.next_free += op->len;
-        if (journal.next_free >= journal.len)
-            journal.next_free = 512;
         journal.sector_info[journal.cur_sector].usage_count++;
-        journal.crc32_last = je->crc32;
         op->pending_ops = 2;
-        op->used_journal_sector = 1 + journal.cur_sector;
+        op->min_used_journal_sector = op->max_used_journal_sector = 1 + journal.cur_sector;
     }
     return 1;
 }
 
 int blockstore::dequeue_sync(blockstore_operation *op)
 {
-    op->has_big_writes = 0x10000;
+    op->big_write_count = 0;
+    op->big_write_state = 0x10000;
     op->sync_writes.swap(unsynced_writes);
     unsynced_writes.clear();
-    auto it = sync_writes.begin();
-    while (it != sync_writes.end())
+    auto it = op->sync_writes.begin();
+    while (it != op->sync_writes.end())
     {
         uint32_t state = dirty_db[*it].state;
         if (IS_BIG_WRITE(state))
         {
-            op->has_big_writes = op->has_big_writes < state ? op->has_big_writes : state;
+            op->big_write_count++;
+            op->big_write_state = op->big_write_state < state ? op->big_write_state : state;
         }
         it++;
     }
-    if (op->has_big_writes == 0x10000 || op->has_big_writes == ST_D_META_WRITTEN)
+    if (op->big_write_count == 0 || op->big_write_state == ST_D_META_WRITTEN)
    {
         // Just fsync the journal
         BS_SUBMIT_GET_SQE(sqe, data);
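
To make the branch structure of dequeue_sync() easier to follow, here is a self-contained sketch of how big_write_count and big_write_state select the next step (the ST_* values and their relative order are stand-ins for this example; only the selection logic mirrors the diff): the state is minimized over all unsynced big writes, so the least-advanced write determines how much work the sync still needs.

#include <cstdio>
#include <vector>
#include <algorithm>

// Stand-in progression for a big (redirect) write; real values live in the headers.
enum { ST_D_SUBMITTED = 1, ST_D_WRITTEN = 2, ST_D_SYNCED = 3, ST_D_META_WRITTEN = 4 };

int main()
{
    std::vector<int> big_writes = { ST_D_SYNCED, ST_D_WRITTEN, ST_D_SYNCED };
    int big_write_count = (int)big_writes.size();
    int big_write_state = 0x10000; // sentinel above every real state, as in the diff
    for (int s : big_writes)
        big_write_state = std::min(big_write_state, s);
    if (big_write_count == 0 || big_write_state == ST_D_META_WRITTEN)
        printf("just fsync the journal\n");
    else if (big_write_state == ST_D_WRITTEN)
        printf("1st step: fsync the data device\n");      // prints this
    else if (big_write_state == ST_D_SYNCED)
        printf("2nd step: write big-write journal entries\n");
    return 0;
}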
@@ -142,7 +139,7 @@ int blockstore::dequeue_sync(blockstore_operation *op)
         data->op = op;
         op->pending_ops = 1;
     }
-    else if (op->has_big_writes == ST_D_WRITTEN)
+    else if (op->big_write_state == ST_D_WRITTEN)
     {
         // 1st step: fsync data
         BS_SUBMIT_GET_SQE(sqe, data);
@@ -150,10 +147,87 @@ int blockstore::dequeue_sync(blockstore_operation *op)
         data->op = op;
         op->pending_ops = 1;
     }
-    else if (op->has_big_writes == ST_D_SYNCED)
+    else if (op->big_write_state == ST_D_SYNCED)
     {
         // 2nd step: Data device is synced, prepare & write journal entries
+        // Check space in the journal and journal memory buffers
+        int required = op->big_write_count, sectors_required = 1;
+        uint64_t next_pos = journal.next_free, next_sector = journal.cur_sector;
+        while (1)
+        {
+            int fits = (512 - journal.in_sector_pos) / sizeof(journal_entry_big_write);
+            required -= fits;
+            if (required <= 0)
+                break;
+            next_pos = (next_pos+512) < journal.len ? next_pos+512 : 512;
+            sectors_required++;
+            next_sector = ((next_sector + 1) % journal.sector_count);
+            if (journal.sector_info[next_sector].usage_count > 0)
+            {
+                // No memory buffer available. Wait for it.
+                op->wait_for = WAIT_JOURNAL_BUFFER;
+                return 0;
+            }
+        }
+        if (next_pos >= journal.used_start)
+        {
+            // No space in the journal. Wait for it.
+            op->wait_for = WAIT_JOURNAL;
+            op->wait_detail = next_pos;
+            return 0;
+        }
+        // Get SQEs. Don't bother about merging, submit each journal sector as a separate request
+        struct io_uring_sqe *sqe[sectors_required];
+        for (int i = 0; i < sectors_required; i++)
+        {
+            BS_SUBMIT_GET_SQE_DECL(sqe[i]);
+        }
+        // Prepare and submit journal entries
+        op->min_used_journal_sector = 1 + journal.cur_sector;
+        sectors_required = 0;
+        required = op->big_write_count;
+        it = op->sync_writes.begin();
+        while (1)
+        {
+            int fits = (512 - journal.in_sector_pos) / sizeof(journal_entry_big_write);
+            while (fits > 0 && required > 0)
+            {
+                journal_entry_big_write *je = (journal_entry_big_write*)(
+                    journal.sector_buf + 512*journal.cur_sector + journal.in_sector_pos
+                );
+                *je = {
+                    .crc32 = 0,
+                    .magic = JOURNAL_MAGIC,
+                    .type = JE_BIG_WRITE,
+                    .size = sizeof(journal_entry_big_write),
+                    .crc32_prev = journal.crc32_last,
+                    .oid = it->oid,
+                    .version = it->version,
+                    .block = dirty_db[*it].location,
+                };
+                je->crc32 = je_crc32((journal_entry*)je);
+                journal.crc32_last = je->crc32;
+                journal.in_sector_pos += sizeof(journal_entry_big_write);
+                required--;
+            }
+            if (required <= 0)
+                break;
+            journal.sector_info[journal.cur_sector].usage_count++;
+            struct ring_data_t *data = ((ring_data_t*)sqe[sectors_required]->user_data);
+            data->iov = (struct iovec){ journal.sector_buf + 512*journal.cur_sector, 512 };
+            data->op = op;
+            io_uring_prep_writev(
+                sqe[sectors_required], journal.fd, &data->iov, 1, journal.offset + journal.sector_info[journal.cur_sector].offset
+            );
+            journal.cur_sector = ((journal.cur_sector + 1) % journal.sector_count);
+            journal.sector_info[journal.cur_sector].offset = journal.next_free;
+            journal.in_sector_pos = 0;
+            journal.next_free = (journal.next_free + 512) < journal.len ? journal.next_free + 512 : 512;
+            memset(journal.sector_buf + 512*journal.cur_sector, 0, 512);
+            sectors_required++;
+        }
+        op->pending_ops = sectors_required;
+        op->max_used_journal_sector = 1 + journal.cur_sector;
     }
     return 1;
 }
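
Finally, a self-contained sketch of the sector estimate performed at the top of the ST_D_SYNCED branch: how many 512-byte journal sectors are needed for a given number of big-write entries. The entry size used below is an assumption (the real sizeof(journal_entry_big_write) depends on the packing in the headers), and unlike the loop above this version resets the in-sector offset for every following sector.

#include <cstdint>
#include <cstdio>

static const uint64_t entry_size = 48; // assumed sizeof(journal_entry_big_write)

static int sectors_needed(int count, uint64_t in_sector_pos)
{
    int sectors = 1;
    while (1)
    {
        int fits = (int)((512 - in_sector_pos) / entry_size);
        count -= fits;
        if (count <= 0)
            break;
        sectors++;
        in_sector_pos = 0; // each following sector starts empty
    }
    return sectors;
}

int main()
{
    // 25 entries with 100 bytes already used in the current sector:
    // 8 fit in the first sector, 10 in each following one -> 3 sectors.
    printf("%d\n", sectors_needed(25, 100));
    return 0;
}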