forked from vitalif/vitastor
Do not overwrite the same journal sector multiple times
It doesn't reduce actual write amplification (WA), but it reduces tail latency (Q=32, 10% / 50% / 90% / 99% / 99.95%):
- write: 766us/979us/1090us/1303us/1729us vs 1074us/1450us/2212us/3261us/4113us
- sync: 701us/881us/1188us/1762us/2540us vs 269us/955us/1663us/2638us/4146us
blocking-uring-test
parent
111516381f
commit
a3d3949dce
|
@ -125,6 +125,12 @@ void blockstore_impl_t::loop()
|
||||||
if (PRIV(op)->wait_for)
|
if (PRIV(op)->wait_for)
|
||||||
{
|
{
|
||||||
check_wait(op);
|
check_wait(op);
|
||||||
|
#ifdef BLOCKSTORE_DEBUG
|
||||||
|
if (PRIV(op)->wait_for)
|
||||||
|
{
|
||||||
|
printf("still waiting for %d\n", PRIV(op)->wait_for);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
if (PRIV(op)->wait_for == WAIT_SQE)
|
if (PRIV(op)->wait_for == WAIT_SQE)
|
||||||
{
|
{
|
||||||
break;
|
break;
|
||||||
|
@ -270,7 +276,9 @@ void blockstore_impl_t::check_wait(blockstore_op_t *op)
|
||||||
}
|
}
|
||||||
else if (PRIV(op)->wait_for == WAIT_JOURNAL_BUFFER)
|
else if (PRIV(op)->wait_for == WAIT_JOURNAL_BUFFER)
|
||||||
{
|
{
|
||||||
if (journal.sector_info[((journal.cur_sector + 1) % journal.sector_count)].usage_count > 0)
|
int next = ((journal.cur_sector + 1) % journal.sector_count);
|
||||||
|
if (journal.sector_info[next].usage_count > 0 ||
|
||||||
|
journal.sector_info[next].dirty)
|
||||||
{
|
{
|
||||||
// do not submit
|
// do not submit
|
||||||
return;
|
return;
|
||||||
|
|
|
@ -256,6 +256,8 @@ class blockstore_impl_t
|
||||||
void enqueue_write(blockstore_op_t *op);
|
void enqueue_write(blockstore_op_t *op);
|
||||||
int dequeue_write(blockstore_op_t *op);
|
int dequeue_write(blockstore_op_t *op);
|
||||||
int dequeue_del(blockstore_op_t *op);
|
int dequeue_del(blockstore_op_t *op);
|
||||||
|
void ack_write(blockstore_op_t *op);
|
||||||
|
void release_journal_sectors(blockstore_op_t *op);
|
||||||
void handle_write_event(ring_data_t *data, blockstore_op_t *op);
|
void handle_write_event(ring_data_t *data, blockstore_op_t *op);
|
||||||
|
|
||||||
// Sync
|
// Sync
|
||||||
|
|
|
@ -22,6 +22,11 @@ int blockstore_journal_check_t::check_available(blockstore_op_t *op, int require
|
||||||
next_in_pos += fits * size;
|
next_in_pos += fits * size;
|
||||||
sectors_required++;
|
sectors_required++;
|
||||||
}
|
}
|
||||||
|
else if (bs->journal.sector_info[next_sector].dirty)
|
||||||
|
{
|
||||||
|
// sectors_required is more like "sectors to write"
|
||||||
|
sectors_required++;
|
||||||
|
}
|
||||||
if (required <= 0)
|
if (required <= 0)
|
||||||
{
|
{
|
||||||
break;
|
break;
|
||||||
|
@ -33,13 +38,19 @@ int blockstore_journal_check_t::check_available(blockstore_op_t *op, int require
|
||||||
right_dir = false;
|
right_dir = false;
|
||||||
}
|
}
|
||||||
next_in_pos = 0;
|
next_in_pos = 0;
|
||||||
if (bs->journal.sector_info[next_sector].usage_count > 0)
|
if (bs->journal.sector_info[next_sector].usage_count > 0 ||
|
||||||
|
bs->journal.sector_info[next_sector].dirty)
|
||||||
{
|
{
|
||||||
next_sector = ((next_sector + 1) % bs->journal.sector_count);
|
next_sector = ((next_sector + 1) % bs->journal.sector_count);
|
||||||
}
|
}
|
||||||
if (bs->journal.sector_info[next_sector].usage_count > 0)
|
if (bs->journal.sector_info[next_sector].usage_count > 0 ||
|
||||||
|
bs->journal.sector_info[next_sector].dirty)
|
||||||
{
|
{
|
||||||
// No memory buffer available. Wait for it.
|
// No memory buffer available. Wait for it.
|
||||||
|
#ifdef BLOCKSTORE_DEBUG
|
||||||
|
printf("next journal buffer %d is still dirty=%d used=%d\n", next_sector,
|
||||||
|
bs->journal.sector_info[next_sector].dirty, bs->journal.sector_info[next_sector].usage_count);
|
||||||
|
#endif
|
||||||
PRIV(op)->wait_for = WAIT_JOURNAL_BUFFER;
|
PRIV(op)->wait_for = WAIT_JOURNAL_BUFFER;
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
@ -68,6 +79,7 @@ journal_entry* prefill_single_journal_entry(journal_t & journal, uint16_t type,
|
||||||
{
|
{
|
||||||
if (JOURNAL_BLOCK_SIZE - journal.in_sector_pos < size)
|
if (JOURNAL_BLOCK_SIZE - journal.in_sector_pos < size)
|
||||||
{
|
{
|
||||||
|
assert(!journal.sector_info[journal.cur_sector].dirty);
|
||||||
// Move to the next journal sector
|
// Move to the next journal sector
|
||||||
if (journal.sector_info[journal.cur_sector].usage_count > 0)
|
if (journal.sector_info[journal.cur_sector].usage_count > 0)
|
||||||
{
|
{
|
||||||
|
@ -91,22 +103,24 @@ journal_entry* prefill_single_journal_entry(journal_t & journal, uint16_t type,
|
||||||
je->type = type;
|
je->type = type;
|
||||||
je->size = size;
|
je->size = size;
|
||||||
je->crc32_prev = journal.crc32_last;
|
je->crc32_prev = journal.crc32_last;
|
||||||
|
journal.sector_info[journal.cur_sector].dirty = true;
|
||||||
return je;
|
return je;
|
||||||
}
|
}
|
||||||
|
|
||||||
void prepare_journal_sector_write(journal_t & journal, io_uring_sqe *sqe, std::function<void(ring_data_t*)> cb)
|
void prepare_journal_sector_write(journal_t & journal, int cur_sector, io_uring_sqe *sqe, std::function<void(ring_data_t*)> cb)
|
||||||
{
|
{
|
||||||
journal.sector_info[journal.cur_sector].usage_count++;
|
journal.sector_info[cur_sector].dirty = false;
|
||||||
|
journal.sector_info[cur_sector].usage_count++;
|
||||||
ring_data_t *data = ((ring_data_t*)sqe->user_data);
|
ring_data_t *data = ((ring_data_t*)sqe->user_data);
|
||||||
data->iov = (struct iovec){
|
data->iov = (struct iovec){
|
||||||
(journal.inmemory
|
(journal.inmemory
|
||||||
? journal.buffer + journal.sector_info[journal.cur_sector].offset
|
? journal.buffer + journal.sector_info[cur_sector].offset
|
||||||
: journal.sector_buf + JOURNAL_BLOCK_SIZE*journal.cur_sector),
|
: journal.sector_buf + JOURNAL_BLOCK_SIZE*cur_sector),
|
||||||
JOURNAL_BLOCK_SIZE
|
JOURNAL_BLOCK_SIZE
|
||||||
};
|
};
|
||||||
data->callback = cb;
|
data->callback = cb;
|
||||||
my_uring_prep_writev(
|
my_uring_prep_writev(
|
||||||
sqe, journal.fd, &data->iov, 1, journal.offset + journal.sector_info[journal.cur_sector].offset
|
sqe, journal.fd, &data->iov, 1, journal.offset + journal.sector_info[cur_sector].offset
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -112,6 +112,7 @@ struct journal_sector_info_t
|
||||||
{
|
{
|
||||||
uint64_t offset;
|
uint64_t offset;
|
||||||
uint64_t usage_count;
|
uint64_t usage_count;
|
||||||
|
bool dirty;
|
||||||
};
|
};
|
||||||
|
|
||||||
struct journal_t
|
struct journal_t
|
||||||
|
@ -154,4 +155,4 @@ struct blockstore_journal_check_t
|
||||||
|
|
||||||
journal_entry* prefill_single_journal_entry(journal_t & journal, uint16_t type, uint32_t size);
|
journal_entry* prefill_single_journal_entry(journal_t & journal, uint16_t type, uint32_t size);
|
||||||
|
|
||||||
void prepare_journal_sector_write(journal_t & journal, io_uring_sqe *sqe, std::function<void(ring_data_t*)> cb);
|
void prepare_journal_sector_write(journal_t & journal, int sector, io_uring_sqe *sqe, std::function<void(ring_data_t*)> cb);
|
||||||
|
|
|
@ -94,6 +94,14 @@ int blockstore_impl_t::dequeue_stable(blockstore_op_t *op)
|
||||||
// Prepare and submit journal entries
|
// Prepare and submit journal entries
|
||||||
auto cb = [this, op](ring_data_t *data) { handle_stable_event(data, op); };
|
auto cb = [this, op](ring_data_t *data) { handle_stable_event(data, op); };
|
||||||
int s = 0, cur_sector = -1;
|
int s = 0, cur_sector = -1;
|
||||||
|
if ((JOURNAL_BLOCK_SIZE - journal.in_sector_pos) < sizeof(journal_entry_stable) &&
|
||||||
|
journal.sector_info[journal.cur_sector].dirty)
|
||||||
|
{
|
||||||
|
if (cur_sector == -1)
|
||||||
|
PRIV(op)->min_used_journal_sector = 1 + journal.cur_sector;
|
||||||
|
cur_sector = journal.cur_sector;
|
||||||
|
prepare_journal_sector_write(journal, cur_sector, sqe[s++], cb);
|
||||||
|
}
|
||||||
for (i = 0, v = (obj_ver_id*)op->buf; i < op->len; i++, v++)
|
for (i = 0, v = (obj_ver_id*)op->buf; i < op->len; i++, v++)
|
||||||
{
|
{
|
||||||
auto unstab_it = unstable_writes.find(v->oid);
|
auto unstab_it = unstable_writes.find(v->oid);
|
||||||
|
@ -104,6 +112,7 @@ int blockstore_impl_t::dequeue_stable(blockstore_op_t *op)
|
||||||
}
|
}
|
||||||
journal_entry_stable *je = (journal_entry_stable*)
|
journal_entry_stable *je = (journal_entry_stable*)
|
||||||
prefill_single_journal_entry(journal, JE_STABLE, sizeof(journal_entry_stable));
|
prefill_single_journal_entry(journal, JE_STABLE, sizeof(journal_entry_stable));
|
||||||
|
journal.sector_info[journal.cur_sector].dirty = false;
|
||||||
je->oid = v->oid;
|
je->oid = v->oid;
|
||||||
je->version = v->version;
|
je->version = v->version;
|
||||||
je->crc32 = je_crc32((journal_entry*)je);
|
je->crc32 = je_crc32((journal_entry*)je);
|
||||||
|
@ -113,7 +122,7 @@ int blockstore_impl_t::dequeue_stable(blockstore_op_t *op)
|
||||||
if (cur_sector == -1)
|
if (cur_sector == -1)
|
||||||
PRIV(op)->min_used_journal_sector = 1 + journal.cur_sector;
|
PRIV(op)->min_used_journal_sector = 1 + journal.cur_sector;
|
||||||
cur_sector = journal.cur_sector;
|
cur_sector = journal.cur_sector;
|
||||||
prepare_journal_sector_write(journal, sqe[s++], cb);
|
prepare_journal_sector_write(journal, cur_sector, sqe[s++], cb);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
PRIV(op)->max_used_journal_sector = 1 + journal.cur_sector;
|
PRIV(op)->max_used_journal_sector = 1 + journal.cur_sector;
|
||||||
|
@ -135,18 +144,7 @@ void blockstore_impl_t::handle_stable_event(ring_data_t *data, blockstore_op_t *
|
||||||
if (PRIV(op)->pending_ops == 0)
|
if (PRIV(op)->pending_ops == 0)
|
||||||
{
|
{
|
||||||
// Release used journal sectors
|
// Release used journal sectors
|
||||||
if (PRIV(op)->min_used_journal_sector > 0)
|
release_journal_sectors(op);
|
||||||
{
|
|
||||||
uint64_t s = PRIV(op)->min_used_journal_sector;
|
|
||||||
while (1)
|
|
||||||
{
|
|
||||||
journal.sector_info[s-1].usage_count--;
|
|
||||||
if (s == PRIV(op)->max_used_journal_sector)
|
|
||||||
break;
|
|
||||||
s = 1 + s % journal.sector_count;
|
|
||||||
}
|
|
||||||
PRIV(op)->min_used_journal_sector = PRIV(op)->max_used_journal_sector = 0;
|
|
||||||
}
|
|
||||||
// First step: mark dirty_db entries as stable, acknowledge op completion
|
// First step: mark dirty_db entries as stable, acknowledge op completion
|
||||||
obj_ver_id* v;
|
obj_ver_id* v;
|
||||||
int i;
|
int i;
|
||||||
|
|
|
@ -39,14 +39,36 @@ int blockstore_impl_t::continue_sync(blockstore_op_t *op)
|
||||||
if (PRIV(op)->sync_state == SYNC_HAS_SMALL)
|
if (PRIV(op)->sync_state == SYNC_HAS_SMALL)
|
||||||
{
|
{
|
||||||
// No big writes, just fsync the journal
|
// No big writes, just fsync the journal
|
||||||
|
int n_sqes = disable_fsync ? 0 : 1;
|
||||||
|
if (journal.sector_info[journal.cur_sector].dirty)
|
||||||
|
{
|
||||||
|
n_sqes++;
|
||||||
|
}
|
||||||
|
if (n_sqes > 0)
|
||||||
|
{
|
||||||
|
io_uring_sqe* sqes[n_sqes];
|
||||||
|
for (int i = 0; i < n_sqes; i++)
|
||||||
|
{
|
||||||
|
BS_SUBMIT_GET_SQE_DECL(sqes[i]);
|
||||||
|
}
|
||||||
|
int s = 0;
|
||||||
|
if (journal.sector_info[journal.cur_sector].dirty)
|
||||||
|
{
|
||||||
|
prepare_journal_sector_write(journal, journal.cur_sector, sqes[s++], cb);
|
||||||
|
PRIV(op)->min_used_journal_sector = PRIV(op)->max_used_journal_sector = 1 + journal.cur_sector;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
PRIV(op)->min_used_journal_sector = PRIV(op)->max_used_journal_sector = 0;
|
||||||
|
}
|
||||||
if (!disable_fsync)
|
if (!disable_fsync)
|
||||||
{
|
{
|
||||||
BS_SUBMIT_GET_SQE(sqe, data);
|
ring_data_t *data = ((ring_data_t*)sqes[s]->user_data);
|
||||||
my_uring_prep_fsync(sqe, journal.fd, IORING_FSYNC_DATASYNC);
|
my_uring_prep_fsync(sqes[s++], journal.fd, IORING_FSYNC_DATASYNC);
|
||||||
data->iov = { 0 };
|
data->iov = { 0 };
|
||||||
data->callback = cb;
|
data->callback = cb;
|
||||||
PRIV(op)->min_used_journal_sector = PRIV(op)->max_used_journal_sector = 0;
|
}
|
||||||
PRIV(op)->pending_ops = 1;
|
PRIV(op)->pending_ops = s;
|
||||||
PRIV(op)->sync_state = SYNC_JOURNAL_SYNC_SENT;
|
PRIV(op)->sync_state = SYNC_JOURNAL_SYNC_SENT;
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
|
@ -90,11 +112,20 @@ int blockstore_impl_t::continue_sync(blockstore_op_t *op)
|
||||||
// Prepare and submit journal entries
|
// Prepare and submit journal entries
|
||||||
auto it = PRIV(op)->sync_big_writes.begin();
|
auto it = PRIV(op)->sync_big_writes.begin();
|
||||||
int s = 0, cur_sector = -1;
|
int s = 0, cur_sector = -1;
|
||||||
|
if ((JOURNAL_BLOCK_SIZE - journal.in_sector_pos) < sizeof(journal_entry_big_write) &&
|
||||||
|
journal.sector_info[journal.cur_sector].dirty)
|
||||||
|
{
|
||||||
|
if (cur_sector == -1)
|
||||||
|
PRIV(op)->min_used_journal_sector = 1 + journal.cur_sector;
|
||||||
|
cur_sector = journal.cur_sector;
|
||||||
|
prepare_journal_sector_write(journal, cur_sector, sqe[s++], cb);
|
||||||
|
}
|
||||||
while (it != PRIV(op)->sync_big_writes.end())
|
while (it != PRIV(op)->sync_big_writes.end())
|
||||||
{
|
{
|
||||||
journal_entry_big_write *je = (journal_entry_big_write*)
|
journal_entry_big_write *je = (journal_entry_big_write*)
|
||||||
prefill_single_journal_entry(journal, JE_BIG_WRITE, sizeof(journal_entry_big_write));
|
prefill_single_journal_entry(journal, JE_BIG_WRITE, sizeof(journal_entry_big_write));
|
||||||
dirty_db[*it].journal_sector = journal.sector_info[journal.cur_sector].offset;
|
dirty_db[*it].journal_sector = journal.sector_info[journal.cur_sector].offset;
|
||||||
|
journal.sector_info[journal.cur_sector].dirty = false;
|
||||||
journal.used_sectors[journal.sector_info[journal.cur_sector].offset]++;
|
journal.used_sectors[journal.sector_info[journal.cur_sector].offset]++;
|
||||||
#ifdef BLOCKSTORE_DEBUG
|
#ifdef BLOCKSTORE_DEBUG
|
||||||
printf("journal offset %lu is used by %lu:%lu v%lu\n", dirty_db[*it].journal_sector, it->oid.inode, it->oid.stripe, it->version);
|
printf("journal offset %lu is used by %lu:%lu v%lu\n", dirty_db[*it].journal_sector, it->oid.inode, it->oid.stripe, it->version);
|
||||||
|
@ -112,7 +143,7 @@ int blockstore_impl_t::continue_sync(blockstore_op_t *op)
|
||||||
if (cur_sector == -1)
|
if (cur_sector == -1)
|
||||||
PRIV(op)->min_used_journal_sector = 1 + journal.cur_sector;
|
PRIV(op)->min_used_journal_sector = 1 + journal.cur_sector;
|
||||||
cur_sector = journal.cur_sector;
|
cur_sector = journal.cur_sector;
|
||||||
prepare_journal_sector_write(journal, sqe[s++], cb);
|
prepare_journal_sector_write(journal, cur_sector, sqe[s++], cb);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
PRIV(op)->max_used_journal_sector = 1 + journal.cur_sector;
|
PRIV(op)->max_used_journal_sector = 1 + journal.cur_sector;
|
||||||
|
@ -147,18 +178,7 @@ void blockstore_impl_t::handle_sync_event(ring_data_t *data, blockstore_op_t *op
|
||||||
if (PRIV(op)->pending_ops == 0)
|
if (PRIV(op)->pending_ops == 0)
|
||||||
{
|
{
|
||||||
// Release used journal sectors
|
// Release used journal sectors
|
||||||
if (PRIV(op)->min_used_journal_sector > 0)
|
release_journal_sectors(op);
|
||||||
{
|
|
||||||
uint64_t s = PRIV(op)->min_used_journal_sector;
|
|
||||||
while (1)
|
|
||||||
{
|
|
||||||
journal.sector_info[s-1].usage_count--;
|
|
||||||
if (s == PRIV(op)->max_used_journal_sector)
|
|
||||||
break;
|
|
||||||
s = 1 + s % journal.sector_count;
|
|
||||||
}
|
|
||||||
PRIV(op)->min_used_journal_sector = PRIV(op)->max_used_journal_sector = 0;
|
|
||||||
}
|
|
||||||
// Handle states
|
// Handle states
|
||||||
if (PRIV(op)->sync_state == SYNC_DATA_SYNC_SENT)
|
if (PRIV(op)->sync_state == SYNC_DATA_SYNC_SENT)
|
||||||
{
|
{
|
||||||
|
|
|
@ -137,7 +137,6 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
|
||||||
{
|
{
|
||||||
// Small (journaled) write
|
// Small (journaled) write
|
||||||
// First check if the journal has sufficient space
|
// First check if the journal has sufficient space
|
||||||
// FIXME Always two SQEs for now. Although it's possible to send 1 sometimes
|
|
||||||
blockstore_journal_check_t space_check(this);
|
blockstore_journal_check_t space_check(this);
|
||||||
if (unsynced_big_writes.size() && !space_check.check_available(op, unsynced_big_writes.size(), sizeof(journal_entry_big_write), 0)
|
if (unsynced_big_writes.size() && !space_check.check_available(op, unsynced_big_writes.size(), sizeof(journal_entry_big_write), 0)
|
||||||
|| !space_check.check_available(op, 1, sizeof(journal_entry_small_write), op->len + JOURNAL_STABILIZE_RESERVATION))
|
|| !space_check.check_available(op, 1, sizeof(journal_entry_small_write), op->len + JOURNAL_STABILIZE_RESERVATION))
|
||||||
|
@ -145,18 +144,34 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
// There is sufficient space. Get SQE(s)
|
// There is sufficient space. Get SQE(s)
|
||||||
BS_SUBMIT_GET_ONLY_SQE(sqe1);
|
struct io_uring_sqe *sqe1 = NULL;
|
||||||
|
if ((JOURNAL_BLOCK_SIZE - journal.in_sector_pos) < sizeof(journal_entry_small_write) &&
|
||||||
|
journal.sector_info[journal.cur_sector].dirty)
|
||||||
|
{
|
||||||
|
// Write current journal sector only if it's dirty and full
|
||||||
|
BS_SUBMIT_GET_SQE_DECL(sqe1);
|
||||||
|
}
|
||||||
struct io_uring_sqe *sqe2 = NULL;
|
struct io_uring_sqe *sqe2 = NULL;
|
||||||
struct ring_data_t *data2 = NULL;
|
|
||||||
if (op->len > 0)
|
if (op->len > 0)
|
||||||
{
|
{
|
||||||
BS_SUBMIT_GET_SQE_DECL(sqe2);
|
BS_SUBMIT_GET_SQE_DECL(sqe2);
|
||||||
data2 = ((ring_data_t*)sqe2->user_data);
|
|
||||||
}
|
}
|
||||||
// FIXME: Write journal sector here only if it is full. Otherwise, defer it until SYNC. This will help reduce WA
|
// Got SQEs. Prepare previous journal sector write if required
|
||||||
// Got SQEs. Prepare journal sector write
|
auto cb = [this, op](ring_data_t *data) { handle_write_event(data, op); };
|
||||||
|
if (sqe1)
|
||||||
|
{
|
||||||
|
prepare_journal_sector_write(journal, journal.cur_sector, sqe1, cb);
|
||||||
|
// FIXME rename to min/max _flushing
|
||||||
|
PRIV(op)->min_used_journal_sector = PRIV(op)->max_used_journal_sector = 1 + journal.cur_sector;
|
||||||
|
PRIV(op)->pending_ops++;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
PRIV(op)->min_used_journal_sector = PRIV(op)->max_used_journal_sector = 0;
|
||||||
|
}
|
||||||
|
// Then pre-fill journal entry
|
||||||
journal_entry_small_write *je = (journal_entry_small_write*)
|
journal_entry_small_write *je = (journal_entry_small_write*)
|
||||||
prefill_single_journal_entry(journal, JE_SMALL_WRITE, sizeof(struct journal_entry_small_write));
|
prefill_single_journal_entry(journal, JE_SMALL_WRITE, sizeof(journal_entry_small_write));
|
||||||
dirty_it->second.journal_sector = journal.sector_info[journal.cur_sector].offset;
|
dirty_it->second.journal_sector = journal.sector_info[journal.cur_sector].offset;
|
||||||
journal.used_sectors[journal.sector_info[journal.cur_sector].offset]++;
|
journal.used_sectors[journal.sector_info[journal.cur_sector].offset]++;
|
||||||
#ifdef BLOCKSTORE_DEBUG
|
#ifdef BLOCKSTORE_DEBUG
|
||||||
|
@ -172,9 +187,6 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
|
||||||
je->crc32_data = crc32c(0, op->buf, op->len);
|
je->crc32_data = crc32c(0, op->buf, op->len);
|
||||||
je->crc32 = je_crc32((journal_entry*)je);
|
je->crc32 = je_crc32((journal_entry*)je);
|
||||||
journal.crc32_last = je->crc32;
|
journal.crc32_last = je->crc32;
|
||||||
auto cb = [this, op](ring_data_t *data) { handle_write_event(data, op); };
|
|
||||||
prepare_journal_sector_write(journal, sqe1, cb);
|
|
||||||
PRIV(op)->min_used_journal_sector = PRIV(op)->max_used_journal_sector = 1 + journal.cur_sector;
|
|
||||||
if (op->len > 0)
|
if (op->len > 0)
|
||||||
{
|
{
|
||||||
// Prepare journal data write
|
// Prepare journal data write
|
||||||
|
@ -183,28 +195,34 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
|
||||||
// Copy data
|
// Copy data
|
||||||
memcpy(journal.buffer + journal.next_free, op->buf, op->len);
|
memcpy(journal.buffer + journal.next_free, op->buf, op->len);
|
||||||
}
|
}
|
||||||
|
ring_data_t *data2 = ((ring_data_t*)sqe2->user_data);
|
||||||
data2->iov = (struct iovec){ op->buf, op->len };
|
data2->iov = (struct iovec){ op->buf, op->len };
|
||||||
data2->callback = cb;
|
data2->callback = cb;
|
||||||
my_uring_prep_writev(
|
my_uring_prep_writev(
|
||||||
sqe2, journal.fd, &data2->iov, 1, journal.offset + journal.next_free
|
sqe2, journal.fd, &data2->iov, 1, journal.offset + journal.next_free
|
||||||
);
|
);
|
||||||
PRIV(op)->pending_ops = 2;
|
PRIV(op)->pending_ops++;
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
{
|
{
|
||||||
// Zero-length overwrite. Allowed to bump object version in EC placement groups without actually writing data
|
// Zero-length overwrite. Allowed to bump object version in EC placement groups without actually writing data
|
||||||
PRIV(op)->pending_ops = 1;
|
|
||||||
}
|
}
|
||||||
dirty_it->second.location = journal.next_free;
|
dirty_it->second.location = journal.next_free;
|
||||||
dirty_it->second.state = ST_J_SUBMITTED;
|
dirty_it->second.state = ST_J_SUBMITTED;
|
||||||
journal.next_free += op->len;
|
journal.next_free += op->len;
|
||||||
if (journal.next_free >= journal.len)
|
if (journal.next_free >= journal.len)
|
||||||
|
{
|
||||||
journal.next_free = JOURNAL_BLOCK_SIZE;
|
journal.next_free = JOURNAL_BLOCK_SIZE;
|
||||||
|
}
|
||||||
// Remember small write as unsynced
|
// Remember small write as unsynced
|
||||||
unsynced_small_writes.push_back((obj_ver_id){
|
unsynced_small_writes.push_back((obj_ver_id){
|
||||||
.oid = op->oid,
|
.oid = op->oid,
|
||||||
.version = op->version,
|
.version = op->version,
|
||||||
});
|
});
|
||||||
|
if (!PRIV(op)->pending_ops)
|
||||||
|
{
|
||||||
|
ack_write(op);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
@ -222,9 +240,17 @@ void blockstore_impl_t::handle_write_event(ring_data_t *data, blockstore_op_t *o
|
||||||
}
|
}
|
||||||
PRIV(op)->pending_ops--;
|
PRIV(op)->pending_ops--;
|
||||||
if (PRIV(op)->pending_ops == 0)
|
if (PRIV(op)->pending_ops == 0)
|
||||||
|
{
|
||||||
|
release_journal_sectors(op);
|
||||||
|
ack_write(op);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void blockstore_impl_t::release_journal_sectors(blockstore_op_t *op)
|
||||||
{
|
{
|
||||||
// Release used journal sectors
|
// Release used journal sectors
|
||||||
if (PRIV(op)->min_used_journal_sector > 0)
|
if (PRIV(op)->min_used_journal_sector > 0 &&
|
||||||
|
PRIV(op)->max_used_journal_sector > 0)
|
||||||
{
|
{
|
||||||
uint64_t s = PRIV(op)->min_used_journal_sector;
|
uint64_t s = PRIV(op)->min_used_journal_sector;
|
||||||
while (1)
|
while (1)
|
||||||
|
@ -236,6 +262,10 @@ void blockstore_impl_t::handle_write_event(ring_data_t *data, blockstore_op_t *o
|
||||||
}
|
}
|
||||||
PRIV(op)->min_used_journal_sector = PRIV(op)->max_used_journal_sector = 0;
|
PRIV(op)->min_used_journal_sector = PRIV(op)->max_used_journal_sector = 0;
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
void blockstore_impl_t::ack_write(blockstore_op_t *op)
|
||||||
|
{
|
||||||
// Switch object state
|
// Switch object state
|
||||||
auto & dirty_entry = dirty_db[(obj_ver_id){
|
auto & dirty_entry = dirty_db[(obj_ver_id){
|
||||||
.oid = op->oid,
|
.oid = op->oid,
|
||||||
|
@ -260,7 +290,6 @@ void blockstore_impl_t::handle_write_event(ring_data_t *data, blockstore_op_t *o
|
||||||
op->retval = op->len;
|
op->retval = op->len;
|
||||||
FINISH_OP(op);
|
FINISH_OP(op);
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
int blockstore_impl_t::dequeue_del(blockstore_op_t *op)
|
int blockstore_impl_t::dequeue_del(blockstore_op_t *op)
|
||||||
{
|
{
|
||||||
|
@ -287,7 +316,7 @@ int blockstore_impl_t::dequeue_del(blockstore_op_t *op)
|
||||||
je->crc32 = je_crc32((journal_entry*)je);
|
je->crc32 = je_crc32((journal_entry*)je);
|
||||||
journal.crc32_last = je->crc32;
|
journal.crc32_last = je->crc32;
|
||||||
auto cb = [this, op](ring_data_t *data) { handle_write_event(data, op); };
|
auto cb = [this, op](ring_data_t *data) { handle_write_event(data, op); };
|
||||||
prepare_journal_sector_write(journal, sqe, cb);
|
prepare_journal_sector_write(journal, journal.cur_sector, sqe, cb);
|
||||||
PRIV(op)->min_used_journal_sector = PRIV(op)->max_used_journal_sector = 1 + journal.cur_sector;
|
PRIV(op)->min_used_journal_sector = PRIV(op)->max_used_journal_sector = 1 + journal.cur_sector;
|
||||||
PRIV(op)->pending_ops = 1;
|
PRIV(op)->pending_ops = 1;
|
||||||
dirty_it->second.state = ST_DEL_SUBMITTED;
|
dirty_it->second.state = ST_DEL_SUBMITTED;
|
||||||
|
|
Loading…
Reference in New Issue