OP_DELETE flushing
parent
a7a0946ba8
commit
f4d06ba102
|
@ -105,11 +105,11 @@ void journal_flusher_t::unshift_flush(obj_ver_id ov)
|
||||||
if (!sqe)\
|
if (!sqe)\
|
||||||
{\
|
{\
|
||||||
wait_state = label;\
|
wait_state = label;\
|
||||||
return;\
|
return false;\
|
||||||
}\
|
}\
|
||||||
data = ((ring_data_t*)sqe->user_data);
|
data = ((ring_data_t*)sqe->user_data);
|
||||||
|
|
||||||
void journal_flusher_co::loop()
|
bool journal_flusher_co::loop()
|
||||||
{
|
{
|
||||||
// This is much better than implementing the whole function as an FSM
|
// This is much better than implementing the whole function as an FSM
|
||||||
// Maybe I should consider a coroutine library like https://github.com/hnes/libaco ...
|
// Maybe I should consider a coroutine library like https://github.com/hnes/libaco ...
|
||||||
|
@ -139,11 +139,15 @@ void journal_flusher_co::loop()
|
||||||
goto resume_12;
|
goto resume_12;
|
||||||
else if (wait_state == 13)
|
else if (wait_state == 13)
|
||||||
goto resume_13;
|
goto resume_13;
|
||||||
|
else if (wait_state == 14)
|
||||||
|
goto resume_14;
|
||||||
|
else if (wait_state == 15)
|
||||||
|
goto resume_15;
|
||||||
resume_0:
|
resume_0:
|
||||||
if (!flusher->flush_queue.size())
|
if (!flusher->flush_queue.size())
|
||||||
{
|
{
|
||||||
wait_state = 0;
|
wait_state = 0;
|
||||||
return;
|
return true;
|
||||||
}
|
}
|
||||||
cur.oid = flusher->flush_queue.front();
|
cur.oid = flusher->flush_queue.front();
|
||||||
cur.version = flusher->flush_versions[cur.oid];
|
cur.version = flusher->flush_versions[cur.oid];
|
||||||
|
@ -178,6 +182,7 @@ resume_0:
|
||||||
wait_count = 0;
|
wait_count = 0;
|
||||||
copy_count = 0;
|
copy_count = 0;
|
||||||
clean_loc = UINT64_MAX;
|
clean_loc = UINT64_MAX;
|
||||||
|
has_delete = false;
|
||||||
skip_copy = false;
|
skip_copy = false;
|
||||||
while (1)
|
while (1)
|
||||||
{
|
{
|
||||||
|
@ -221,13 +226,16 @@ resume_0:
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
else if (dirty_it->second.state == ST_D_STABLE)
|
else if (dirty_it->second.state == ST_D_STABLE && !skip_copy)
|
||||||
{
|
{
|
||||||
// There is an unflushed big write. Copy small writes in its position
|
// There is an unflushed big write. Copy small writes in its position
|
||||||
if (!skip_copy)
|
clean_loc = dirty_it->second.location;
|
||||||
{
|
skip_copy = true;
|
||||||
clean_loc = dirty_it->second.location;
|
}
|
||||||
}
|
else if (dirty_it->second.state == ST_DEL_STABLE && !skip_copy)
|
||||||
|
{
|
||||||
|
// There is an unflushed delete
|
||||||
|
has_delete = true;
|
||||||
skip_copy = true;
|
skip_copy = true;
|
||||||
}
|
}
|
||||||
else if (!IS_STABLE(dirty_it->second.state))
|
else if (!IS_STABLE(dirty_it->second.state))
|
||||||
|
@ -249,7 +257,7 @@ resume_0:
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (copy_count == 0 && clean_loc == UINT64_MAX)
|
if (copy_count == 0 && clean_loc == UINT64_MAX && !has_delete)
|
||||||
{
|
{
|
||||||
// Nothing to flush
|
// Nothing to flush
|
||||||
flusher->active_flushers--;
|
flusher->active_flushers--;
|
||||||
|
@ -267,10 +275,11 @@ resume_0:
|
||||||
{
|
{
|
||||||
auto clean_it = bs->clean_db.find(cur.oid);
|
auto clean_it = bs->clean_db.find(cur.oid);
|
||||||
old_clean_loc = (clean_it != bs->clean_db.end() ? clean_it->second.location : UINT64_MAX);
|
old_clean_loc = (clean_it != bs->clean_db.end() ? clean_it->second.location : UINT64_MAX);
|
||||||
|
old_clean_ver = (clean_it != bs->clean_db.end() ? clean_it->second.version : 0);
|
||||||
}
|
}
|
||||||
if (clean_loc == UINT64_MAX)
|
if (clean_loc == UINT64_MAX)
|
||||||
{
|
{
|
||||||
if (old_clean_loc == UINT64_MAX)
|
if (copy_count > 0 && has_delete || old_clean_loc == UINT64_MAX)
|
||||||
{
|
{
|
||||||
// Object not present at all. This is a bug.
|
// Object not present at all. This is a bug.
|
||||||
char err[1024];
|
char err[1024];
|
||||||
|
@ -283,50 +292,41 @@ resume_0:
|
||||||
else
|
else
|
||||||
clean_loc = old_clean_loc;
|
clean_loc = old_clean_loc;
|
||||||
}
|
}
|
||||||
// Also we need to submit the metadata read. We do a read-modify-write for every operation.
|
else
|
||||||
// But we must check if the same sector is already in memory.
|
has_delete = false;
|
||||||
// Another option is to keep all raw metadata in memory all the time. FIXME: Maybe add this mode.
|
// Also we need to submit metadata read(s). We do read-modify-write cycle(s) for every operation.
|
||||||
// And yet another option is to use LSM trees for metadata, but it sophisticates everything a lot,
|
resume_2:
|
||||||
// so I'll avoid it as long as I can.
|
if (!modify_meta_read(clean_loc, meta_new, 2))
|
||||||
meta_sector = ((clean_loc >> bs->block_order) / (512 / sizeof(clean_disk_entry))) * 512;
|
|
||||||
meta_pos = ((clean_loc >> bs->block_order) % (512 / sizeof(clean_disk_entry)));
|
|
||||||
meta_it = flusher->meta_sectors.find(meta_sector);
|
|
||||||
if (meta_it == flusher->meta_sectors.end())
|
|
||||||
{
|
{
|
||||||
// Not in memory yet, read it
|
wait_state += 2;
|
||||||
meta_it = flusher->meta_sectors.emplace(meta_sector, (meta_sector_t){
|
return false;
|
||||||
.offset = meta_sector,
|
}
|
||||||
.len = 512,
|
if (old_clean_loc != UINT64_MAX && old_clean_loc != clean_loc)
|
||||||
.state = 0, // 0 = not read yet
|
{
|
||||||
.buf = memalign(512, 512),
|
resume_14:
|
||||||
.usage_count = 1,
|
if (!modify_meta_read(old_clean_loc, meta_old, 14))
|
||||||
}).first;
|
|
||||||
await_sqe(2);
|
|
||||||
data->iov = (struct iovec){ meta_it->second.buf, 512 };
|
|
||||||
data->callback = [this](ring_data_t* data)
|
|
||||||
{
|
{
|
||||||
if (data->res != data->iov.iov_len)
|
wait_state += 14;
|
||||||
{
|
return false;
|
||||||
throw std::runtime_error(
|
}
|
||||||
"metadata read operation failed ("+std::to_string(data->res)+" != "+std::to_string(data->iov.iov_len)+
|
|
||||||
"). can't continue, sorry :-("
|
|
||||||
);
|
|
||||||
}
|
|
||||||
meta_it->second.state = 1;
|
|
||||||
wait_count--;
|
|
||||||
};
|
|
||||||
my_uring_prep_readv(
|
|
||||||
sqe, bs->meta_fd, &data->iov, 1, bs->meta_offset + meta_sector
|
|
||||||
);
|
|
||||||
wait_count++;
|
|
||||||
}
|
}
|
||||||
else
|
else
|
||||||
meta_it->second.usage_count++;
|
meta_old.submitted = false;
|
||||||
resume_3:
|
resume_3:
|
||||||
if (wait_count > 0)
|
if (wait_count > 0)
|
||||||
{
|
{
|
||||||
wait_state = 3;
|
wait_state = 3;
|
||||||
return;
|
return false;
|
||||||
|
}
|
||||||
|
if (meta_new.submitted)
|
||||||
|
{
|
||||||
|
meta_new.it->second.state = 1;
|
||||||
|
bs->ringloop->wakeup(bs->ring_consumer);
|
||||||
|
}
|
||||||
|
if (meta_old.submitted)
|
||||||
|
{
|
||||||
|
meta_old.it->second.state = 1;
|
||||||
|
bs->ringloop->wakeup(bs->ring_consumer);
|
||||||
}
|
}
|
||||||
// Reads completed, submit writes
|
// Reads completed, submit writes
|
||||||
for (it = v.begin(); it != v.end(); it++)
|
for (it = v.begin(); it != v.end(); it++)
|
||||||
|
@ -340,167 +340,110 @@ resume_0:
|
||||||
wait_count++;
|
wait_count++;
|
||||||
}
|
}
|
||||||
resume_5:
|
resume_5:
|
||||||
// And a metadata write, but only after data writes complete
|
// And metadata writes, but only after data writes complete
|
||||||
if (meta_it->second.state == 0 || wait_count > 0)
|
if (meta_new.it->second.state == 0 || wait_count > 0)
|
||||||
{
|
{
|
||||||
// metadata sector is still being read or data is still being written, wait for it
|
// metadata sector is still being read or data is still being written, wait for it
|
||||||
wait_state = 5;
|
wait_state = 5;
|
||||||
return;
|
return false;
|
||||||
}
|
}
|
||||||
((clean_disk_entry*)meta_it->second.buf)[meta_pos] = {
|
if (old_clean_loc != UINT64_MAX && old_clean_loc != clean_loc)
|
||||||
.oid = cur.oid,
|
{
|
||||||
.version = cur.version,
|
if (meta_old.it->second.state == 0)
|
||||||
};
|
{
|
||||||
|
wait_state = 5;
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
((clean_disk_entry*)meta_old.it->second.buf)[meta_old.pos] = { 0 };
|
||||||
|
await_sqe(15);
|
||||||
|
data->iov = (struct iovec){ meta_old.it->second.buf, 512 };
|
||||||
|
data->callback = simple_callback_w;
|
||||||
|
my_uring_prep_writev(
|
||||||
|
sqe, bs->meta_fd, &data->iov, 1, bs->meta_offset + meta_old.sector
|
||||||
|
);
|
||||||
|
wait_count++;
|
||||||
|
}
|
||||||
|
((clean_disk_entry*)meta_new.it->second.buf)[meta_new.pos] = has_delete
|
||||||
|
? (clean_disk_entry){ 0 }
|
||||||
|
: (clean_disk_entry){
|
||||||
|
.oid = cur.oid,
|
||||||
|
.version = cur.version,
|
||||||
|
};
|
||||||
await_sqe(6);
|
await_sqe(6);
|
||||||
data->iov = (struct iovec){ meta_it->second.buf, 512 };
|
data->iov = (struct iovec){ meta_new.it->second.buf, 512 };
|
||||||
data->callback = simple_callback_w;
|
data->callback = simple_callback_w;
|
||||||
my_uring_prep_writev(
|
my_uring_prep_writev(
|
||||||
sqe, bs->meta_fd, &data->iov, 1, bs->meta_offset + meta_sector
|
sqe, bs->meta_fd, &data->iov, 1, bs->meta_offset + meta_new.sector
|
||||||
);
|
);
|
||||||
wait_count++;
|
wait_count++;
|
||||||
resume_7:
|
resume_7:
|
||||||
if (wait_count > 0)
|
if (wait_count > 0)
|
||||||
{
|
{
|
||||||
wait_state = 7;
|
wait_state = 7;
|
||||||
return;
|
return false;
|
||||||
}
|
}
|
||||||
// Done, free all buffers
|
// Done, free all buffers
|
||||||
meta_it->second.usage_count--;
|
meta_new.it->second.usage_count--;
|
||||||
if (meta_it->second.usage_count == 0)
|
if (meta_new.it->second.usage_count == 0)
|
||||||
{
|
{
|
||||||
free(meta_it->second.buf);
|
free(meta_new.it->second.buf);
|
||||||
flusher->meta_sectors.erase(meta_it);
|
flusher->meta_sectors.erase(meta_new.it);
|
||||||
|
}
|
||||||
|
if (old_clean_loc != UINT64_MAX && old_clean_loc != clean_loc)
|
||||||
|
{
|
||||||
|
meta_old.it->second.usage_count--;
|
||||||
|
if (meta_old.it->second.usage_count == 0)
|
||||||
|
{
|
||||||
|
free(meta_old.it->second.buf);
|
||||||
|
flusher->meta_sectors.erase(meta_old.it);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
for (it = v.begin(); it != v.end(); it++)
|
for (it = v.begin(); it != v.end(); it++)
|
||||||
{
|
{
|
||||||
free(it->buf);
|
free(it->buf);
|
||||||
}
|
}
|
||||||
v.clear();
|
v.clear();
|
||||||
|
// And sync everything (in batches - not per each operation!)
|
||||||
flusher->active_until_sync--;
|
flusher->active_until_sync--;
|
||||||
if (!bs->disable_fsync)
|
resume_8:
|
||||||
|
resume_9:
|
||||||
|
resume_10:
|
||||||
|
resume_11:
|
||||||
|
if (!fsync_batch())
|
||||||
{
|
{
|
||||||
// And sync everything (in batches - not per each operation!)
|
return false;
|
||||||
cur_sync = flusher->syncs.end();
|
|
||||||
if (cur_sync == flusher->syncs.begin() || cur_sync->state == 1)
|
|
||||||
cur_sync = flusher->syncs.emplace(flusher->syncs.end(), (flusher_sync_t){ .ready_count = 0, .state = 0 });
|
|
||||||
else
|
|
||||||
cur_sync--;
|
|
||||||
cur_sync->ready_count++;
|
|
||||||
if (cur_sync->ready_count >= flusher->sync_threshold ||
|
|
||||||
!flusher->active_until_sync && (!flusher->flush_queue.size() || flusher->active_flushers >= flusher->flusher_count))
|
|
||||||
{
|
|
||||||
// Sync batch is ready. Do it.
|
|
||||||
await_sqe(9);
|
|
||||||
data->callback = simple_callback_w;
|
|
||||||
data->iov = { 0 };
|
|
||||||
my_uring_prep_fsync(sqe, bs->data_fd, IORING_FSYNC_DATASYNC);
|
|
||||||
wait_count++;
|
|
||||||
if (bs->meta_fd != bs->data_fd)
|
|
||||||
{
|
|
||||||
await_sqe(10);
|
|
||||||
data->callback = simple_callback_w;
|
|
||||||
data->iov = { 0 };
|
|
||||||
my_uring_prep_fsync(sqe, bs->meta_fd, IORING_FSYNC_DATASYNC);
|
|
||||||
wait_count++;
|
|
||||||
}
|
|
||||||
resume_11:
|
|
||||||
if (wait_count > 0)
|
|
||||||
{
|
|
||||||
wait_state = 11;
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
// Sync completed. All previous coroutines waiting for it must be resumed
|
|
||||||
cur_sync->state = 1;
|
|
||||||
bs->ringloop->wakeup(bs->ring_consumer);
|
|
||||||
}
|
|
||||||
// Wait until someone else sends and completes a sync.
|
|
||||||
resume_8:
|
|
||||||
if (!cur_sync->state)
|
|
||||||
{
|
|
||||||
wait_state = 8;
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
cur_sync->ready_count--;
|
|
||||||
if (cur_sync->ready_count == 0)
|
|
||||||
{
|
|
||||||
flusher->syncs.erase(cur_sync);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
// Update clean_db and dirty_db, free old data locations
|
// Update clean_db and dirty_db, free old data locations
|
||||||
if (old_clean_loc != clean_loc)
|
update_clean_db();
|
||||||
{
|
|
||||||
#ifdef BLOCKSTORE_DEBUG
|
|
||||||
printf("Free block %lu\n", old_clean_loc >> bs->block_order);
|
|
||||||
#endif
|
|
||||||
bs->data_alloc->set(old_clean_loc >> bs->block_order, false);
|
|
||||||
}
|
|
||||||
bs->clean_db[cur.oid] = {
|
|
||||||
.version = cur.version,
|
|
||||||
.location = clean_loc,
|
|
||||||
};
|
|
||||||
dirty_it = dirty_end;
|
|
||||||
while (1)
|
|
||||||
{
|
|
||||||
if (IS_BIG_WRITE(dirty_it->second.state) && dirty_it->second.location != clean_loc)
|
|
||||||
{
|
|
||||||
#ifdef BLOCKSTORE_DEBUG
|
|
||||||
printf("Free block %lu\n", dirty_it->second.location >> bs->block_order);
|
|
||||||
#endif
|
|
||||||
bs->data_alloc->set(dirty_it->second.location >> bs->block_order, false);
|
|
||||||
}
|
|
||||||
#ifdef BLOCKSTORE_DEBUG
|
|
||||||
printf("remove usage of journal offset %lu by %lu:%lu v%lu\n", dirty_it->second.journal_sector, dirty_it->first.oid.inode, dirty_it->first.oid.stripe, dirty_it->first.version);
|
|
||||||
#endif
|
|
||||||
int used = --bs->journal.used_sectors[dirty_it->second.journal_sector];
|
|
||||||
if (used == 0)
|
|
||||||
{
|
|
||||||
bs->journal.used_sectors.erase(dirty_it->second.journal_sector);
|
|
||||||
}
|
|
||||||
if (dirty_it == bs->dirty_db.begin())
|
|
||||||
{
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
dirty_it--;
|
|
||||||
if (dirty_it->first.oid != cur.oid)
|
|
||||||
{
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
// Then, basically, remove everything up to the current version from dirty_db...
|
|
||||||
if (dirty_it->first.oid != cur.oid)
|
|
||||||
dirty_it++;
|
|
||||||
bs->dirty_db.erase(dirty_it, std::next(dirty_end));
|
|
||||||
// Clear unused part of the journal every <journal_trim_interval> flushes
|
// Clear unused part of the journal every <journal_trim_interval> flushes
|
||||||
if (!((++flusher->journal_trim_counter) % flusher->journal_trim_interval))
|
if (!((++flusher->journal_trim_counter) % flusher->journal_trim_interval))
|
||||||
{
|
{
|
||||||
flusher->journal_trim_counter = 0;
|
flusher->journal_trim_counter = 0;
|
||||||
if (!bs->journal.trim())
|
if (bs->journal.trim())
|
||||||
{
|
{
|
||||||
goto do_not_trim;
|
// Update journal "superblock"
|
||||||
}
|
await_sqe(12);
|
||||||
// Update journal "superblock"
|
data->callback = simple_callback_w;
|
||||||
await_sqe(12);
|
*((journal_entry_start*)flusher->journal_superblock) = {
|
||||||
data->callback = simple_callback_w;
|
.crc32 = 0,
|
||||||
*((journal_entry_start*)flusher->journal_superblock) = {
|
.magic = JOURNAL_MAGIC,
|
||||||
.crc32 = 0,
|
.type = JE_START,
|
||||||
.magic = JOURNAL_MAGIC,
|
.size = sizeof(journal_entry_start),
|
||||||
.type = JE_START,
|
.reserved = 0,
|
||||||
.size = sizeof(journal_entry_start),
|
.journal_start = bs->journal.used_start,
|
||||||
.reserved = 0,
|
};
|
||||||
.journal_start = bs->journal.used_start,
|
((journal_entry_start*)flusher->journal_superblock)->crc32 = je_crc32((journal_entry*)flusher->journal_superblock);
|
||||||
};
|
data->iov = (struct iovec){ flusher->journal_superblock, 512 };
|
||||||
((journal_entry_start*)flusher->journal_superblock)->crc32 = je_crc32((journal_entry*)flusher->journal_superblock);
|
my_uring_prep_writev(sqe, bs->journal.fd, &data->iov, 1, bs->journal.offset);
|
||||||
data->iov = (struct iovec){ flusher->journal_superblock, 512 };
|
wait_count++;
|
||||||
my_uring_prep_writev(sqe, bs->journal.fd, &data->iov, 1, bs->journal.offset);
|
resume_13:
|
||||||
wait_count++;
|
if (wait_count > 0)
|
||||||
resume_13:
|
{
|
||||||
if (wait_count > 0)
|
wait_state = 13;
|
||||||
{
|
return false;
|
||||||
wait_state = 13;
|
}
|
||||||
return;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
do_not_trim:
|
|
||||||
// All done
|
// All done
|
||||||
#ifdef BLOCKSTORE_DEBUG
|
#ifdef BLOCKSTORE_DEBUG
|
||||||
printf("Flushed %lu:%lu v%lu\n", cur.oid.inode, cur.oid.stripe, cur.version);
|
printf("Flushed %lu:%lu v%lu\n", cur.oid.inode, cur.oid.stripe, cur.version);
|
||||||
|
@ -515,4 +458,161 @@ resume_0:
|
||||||
flusher->sync_to_repeat.erase(repeat_it);
|
flusher->sync_to_repeat.erase(repeat_it);
|
||||||
goto resume_0;
|
goto resume_0;
|
||||||
}
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool journal_flusher_co::modify_meta_read(uint64_t meta_loc, flusher_meta_write_t &wr, int wait_base)
|
||||||
|
{
|
||||||
|
if (wait_state == wait_base)
|
||||||
|
goto resume_0;
|
||||||
|
// First check whether the same metadata sector is already in memory; read it from disk only if it is not.
|
||||||
|
// Another option is to keep all raw metadata in memory all the time. FIXME: Maybe add this mode.
|
||||||
|
// And yet another option is to use LSM trees for metadata, but it complicates everything a lot,
|
||||||
|
// so I'll avoid it as long as I can.
|
||||||
|
wr.sector = ((meta_loc >> bs->block_order) / (512 / sizeof(clean_disk_entry))) * 512;
|
||||||
|
wr.pos = ((meta_loc >> bs->block_order) % (512 / sizeof(clean_disk_entry)));
|
||||||
|
wr.it = flusher->meta_sectors.find(wr.sector);
|
||||||
|
if (wr.it == flusher->meta_sectors.end())
|
||||||
|
{
|
||||||
|
// Not in memory yet, read it
|
||||||
|
wr.it = flusher->meta_sectors.emplace(wr.sector, (meta_sector_t){
|
||||||
|
.offset = wr.sector,
|
||||||
|
.len = 512,
|
||||||
|
.state = 0, // 0 = not read yet
|
||||||
|
.buf = memalign(512, 512),
|
||||||
|
.usage_count = 1,
|
||||||
|
}).first;
|
||||||
|
await_sqe(0);
|
||||||
|
data->iov = (struct iovec){ wr.it->second.buf, 512 };
|
||||||
|
data->callback = simple_callback_r;
|
||||||
|
wr.submitted = true;
|
||||||
|
my_uring_prep_readv(
|
||||||
|
sqe, bs->meta_fd, &data->iov, 1, bs->meta_offset + wr.sector
|
||||||
|
);
|
||||||
|
wait_count++;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
wr.submitted = false;
|
||||||
|
wr.it->second.usage_count++;
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
void journal_flusher_co::update_clean_db()
|
||||||
|
{
|
||||||
|
if (old_clean_loc != UINT64_MAX && old_clean_loc != clean_loc)
|
||||||
|
{
|
||||||
|
#ifdef BLOCKSTORE_DEBUG
|
||||||
|
printf("Free block %lu\n", old_clean_loc >> bs->block_order);
|
||||||
|
#endif
|
||||||
|
bs->data_alloc->set(old_clean_loc >> bs->block_order, false);
|
||||||
|
}
|
||||||
|
if (has_delete)
|
||||||
|
{
|
||||||
|
auto clean_it = bs->clean_db.find(cur.oid);
|
||||||
|
bs->clean_db.erase(clean_it);
|
||||||
|
bs->data_alloc->set(clean_loc >> bs->block_order, false);
|
||||||
|
clean_loc = UINT64_MAX;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
bs->clean_db[cur.oid] = {
|
||||||
|
.version = cur.version,
|
||||||
|
.location = clean_loc,
|
||||||
|
};
|
||||||
|
}
|
||||||
|
dirty_it = dirty_end;
|
||||||
|
while (1)
|
||||||
|
{
|
||||||
|
if (IS_BIG_WRITE(dirty_it->second.state) && dirty_it->second.location != clean_loc)
|
||||||
|
{
|
||||||
|
#ifdef BLOCKSTORE_DEBUG
|
||||||
|
printf("Free block %lu\n", dirty_it->second.location >> bs->block_order);
|
||||||
|
#endif
|
||||||
|
bs->data_alloc->set(dirty_it->second.location >> bs->block_order, false);
|
||||||
|
}
|
||||||
|
#ifdef BLOCKSTORE_DEBUG
|
||||||
|
printf("remove usage of journal offset %lu by %lu:%lu v%lu\n", dirty_it->second.journal_sector, dirty_it->first.oid.inode, dirty_it->first.oid.stripe, dirty_it->first.version);
|
||||||
|
#endif
|
||||||
|
int used = --bs->journal.used_sectors[dirty_it->second.journal_sector];
|
||||||
|
if (used == 0)
|
||||||
|
{
|
||||||
|
bs->journal.used_sectors.erase(dirty_it->second.journal_sector);
|
||||||
|
}
|
||||||
|
if (dirty_it == bs->dirty_db.begin())
|
||||||
|
{
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
dirty_it--;
|
||||||
|
if (dirty_it->first.oid != cur.oid)
|
||||||
|
{
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// Then, basically, remove everything up to the current version from dirty_db...
|
||||||
|
if (dirty_it->first.oid != cur.oid)
|
||||||
|
dirty_it++;
|
||||||
|
bs->dirty_db.erase(dirty_it, std::next(dirty_end));
|
||||||
|
}
|
||||||
|
|
||||||
|
bool journal_flusher_co::fsync_batch()
|
||||||
|
{
|
||||||
|
if (wait_state == 8)
|
||||||
|
goto resume_8;
|
||||||
|
else if (wait_state == 9)
|
||||||
|
goto resume_9;
|
||||||
|
else if (wait_state == 10)
|
||||||
|
goto resume_10;
|
||||||
|
else if (wait_state == 11)
|
||||||
|
goto resume_11;
|
||||||
|
if (!bs->disable_fsync)
|
||||||
|
{
|
||||||
|
cur_sync = flusher->syncs.end();
|
||||||
|
if (cur_sync == flusher->syncs.begin() || cur_sync->state == 1)
|
||||||
|
cur_sync = flusher->syncs.emplace(flusher->syncs.end(), (flusher_sync_t){ .ready_count = 0, .state = 0 });
|
||||||
|
else
|
||||||
|
cur_sync--;
|
||||||
|
cur_sync->ready_count++;
|
||||||
|
if (cur_sync->ready_count >= flusher->sync_threshold ||
|
||||||
|
!flusher->active_until_sync && (!flusher->flush_queue.size() || flusher->active_flushers >= flusher->flusher_count))
|
||||||
|
{
|
||||||
|
// Sync batch is ready. Do it.
|
||||||
|
await_sqe(9);
|
||||||
|
data->callback = simple_callback_w;
|
||||||
|
data->iov = { 0 };
|
||||||
|
my_uring_prep_fsync(sqe, bs->data_fd, IORING_FSYNC_DATASYNC);
|
||||||
|
wait_count++;
|
||||||
|
if (bs->meta_fd != bs->data_fd)
|
||||||
|
{
|
||||||
|
await_sqe(10);
|
||||||
|
data->callback = simple_callback_w;
|
||||||
|
data->iov = { 0 };
|
||||||
|
my_uring_prep_fsync(sqe, bs->meta_fd, IORING_FSYNC_DATASYNC);
|
||||||
|
wait_count++;
|
||||||
|
}
|
||||||
|
resume_11:
|
||||||
|
if (wait_count > 0)
|
||||||
|
{
|
||||||
|
wait_state = 11;
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
// Sync completed. All previous coroutines waiting for it must be resumed
|
||||||
|
cur_sync->state = 1;
|
||||||
|
bs->ringloop->wakeup(bs->ring_consumer);
|
||||||
|
}
|
||||||
|
// Wait until someone else sends and completes a sync.
|
||||||
|
resume_8:
|
||||||
|
if (!cur_sync->state)
|
||||||
|
{
|
||||||
|
wait_state = 8;
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
cur_sync->ready_count--;
|
||||||
|
if (cur_sync->ready_count == 0)
|
||||||
|
{
|
||||||
|
flusher->syncs.erase(cur_sync);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return true;
|
||||||
}
|
}
|
||||||
|
|
|
@ -18,6 +18,13 @@ struct flusher_sync_t
|
||||||
int state;
|
int state;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
struct flusher_meta_write_t
|
||||||
|
{
|
||||||
|
uint64_t sector, pos;
|
||||||
|
bool submitted;
|
||||||
|
std::map<uint64_t, meta_sector_t>::iterator it;
|
||||||
|
};
|
||||||
|
|
||||||
class journal_flusher_t;
|
class journal_flusher_t;
|
||||||
|
|
||||||
// Journal flusher coroutine
|
// Journal flusher coroutine
|
||||||
|
@ -28,21 +35,24 @@ class journal_flusher_co
|
||||||
int wait_state, wait_count;
|
int wait_state, wait_count;
|
||||||
struct io_uring_sqe *sqe;
|
struct io_uring_sqe *sqe;
|
||||||
struct ring_data_t *data;
|
struct ring_data_t *data;
|
||||||
bool skip_copy;
|
bool skip_copy, has_delete;
|
||||||
obj_ver_id cur;
|
obj_ver_id cur;
|
||||||
std::map<obj_ver_id, dirty_entry>::iterator dirty_it, dirty_start, dirty_end;
|
std::map<obj_ver_id, dirty_entry>::iterator dirty_it, dirty_start, dirty_end;
|
||||||
std::vector<copy_buffer_t> v;
|
std::vector<copy_buffer_t> v;
|
||||||
std::vector<copy_buffer_t>::iterator it;
|
std::vector<copy_buffer_t>::iterator it;
|
||||||
int copy_count;
|
int copy_count;
|
||||||
uint64_t offset, len, submit_offset, submit_len, clean_loc, old_clean_loc, meta_sector, meta_pos;
|
uint64_t offset, len, submit_offset, submit_len, clean_loc, old_clean_loc, old_clean_ver;
|
||||||
std::map<uint64_t, meta_sector_t>::iterator meta_it;
|
flusher_meta_write_t meta_old, meta_new;
|
||||||
std::map<object_id, uint64_t>::iterator repeat_it;
|
std::map<object_id, uint64_t>::iterator repeat_it;
|
||||||
std::function<void(ring_data_t*)> simple_callback_r, simple_callback_w;
|
std::function<void(ring_data_t*)> simple_callback_r, simple_callback_w;
|
||||||
std::list<flusher_sync_t>::iterator cur_sync;
|
std::list<flusher_sync_t>::iterator cur_sync;
|
||||||
friend class journal_flusher_t;
|
friend class journal_flusher_t;
|
||||||
|
bool modify_meta_read(uint64_t meta_loc, flusher_meta_write_t &wr, int wait_base);
|
||||||
|
void update_clean_db();
|
||||||
|
bool fsync_batch();
|
||||||
public:
|
public:
|
||||||
journal_flusher_co();
|
journal_flusher_co();
|
||||||
void loop();
|
bool loop();
|
||||||
};
|
};
|
||||||
|
|
||||||
// Journal flusher itself
|
// Journal flusher itself
|
||||||
|
|
|
@ -56,7 +56,7 @@ int blockstore_init_meta::loop()
|
||||||
}
|
}
|
||||||
if (prev_done)
|
if (prev_done)
|
||||||
{
|
{
|
||||||
int count = 512 / sizeof(clean_disk_entry);
|
unsigned count = 512 / sizeof(clean_disk_entry);
|
||||||
for (int sector = 0; sector < done_len; sector += 512)
|
for (int sector = 0; sector < done_len; sector += 512)
|
||||||
{
|
{
|
||||||
clean_disk_entry *entries = (clean_disk_entry*)(metadata_buffer + (prev_done == 2 ? bs->metadata_buf_size : 0) + sector);
|
clean_disk_entry *entries = (clean_disk_entry*)(metadata_buffer + (prev_done == 2 ? bs->metadata_buf_size : 0) + sector);
|
||||||
|
@ -79,7 +79,7 @@ int blockstore_init_meta::loop()
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
void blockstore_init_meta::handle_entries(struct clean_disk_entry* entries, int count, int block_order)
|
void blockstore_init_meta::handle_entries(struct clean_disk_entry* entries, unsigned count, int block_order)
|
||||||
{
|
{
|
||||||
for (unsigned i = 0; i < count; i++)
|
for (unsigned i = 0; i < count; i++)
|
||||||
{
|
{
|
||||||
|
@ -106,10 +106,12 @@ void blockstore_init_meta::handle_entries(struct clean_disk_entry* entries, int
|
||||||
.location = (done_cnt+i) << block_order,
|
.location = (done_cnt+i) << block_order,
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
#ifdef BLOCKSTORE_DEBUG
|
|
||||||
else
|
else
|
||||||
|
{
|
||||||
|
#ifdef BLOCKSTORE_DEBUG
|
||||||
printf("Old clean entry %lu: %lu:%lu v%lu\n", done_cnt+i, entries[i].oid.inode, entries[i].oid.stripe, entries[i].version);
|
printf("Old clean entry %lu: %lu:%lu v%lu\n", done_cnt+i, entries[i].oid.inode, entries[i].oid.stripe, entries[i].version);
|
||||||
#endif
|
#endif
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -11,7 +11,7 @@ class blockstore_init_meta
|
||||||
uint64_t entries_loaded = 0;
|
uint64_t entries_loaded = 0;
|
||||||
struct io_uring_sqe *sqe;
|
struct io_uring_sqe *sqe;
|
||||||
struct ring_data_t *data;
|
struct ring_data_t *data;
|
||||||
void handle_entries(struct clean_disk_entry* entries, int count, int block_order);
|
void handle_entries(struct clean_disk_entry* entries, unsigned count, int block_order);
|
||||||
void handle_event(ring_data_t *data);
|
void handle_event(ring_data_t *data);
|
||||||
public:
|
public:
|
||||||
blockstore_init_meta(blockstore *bs);
|
blockstore_init_meta(blockstore *bs);
|
||||||
|
|
|
@ -22,10 +22,13 @@
|
||||||
// Stabilize delete:
|
// Stabilize delete:
|
||||||
// 1) Remove metadata entry and sync it
|
// 1) Remove metadata entry and sync it
|
||||||
// 2) Remove dirty_db entry and clear previous journal entries
|
// 2) Remove dirty_db entry and clear previous journal entries
|
||||||
// Note that it will lead to problems in a degraded cluster, because deleting 2 of 3 replicas
|
// We have 2 problems here:
|
||||||
// and restarting the last replica will then result in extra "missing" objects. To solve that
|
// - In the cluster environment, we must store the "tombstones" of deleted objects until
|
||||||
// we need to store the "tombstones" of deleted objects. We can't do that with current simple
|
// all replicas (not just quorum) agree about their deletion. That is, "stabilize" is
|
||||||
// metadata storage so we'll skip TRIM implementation for now.
|
// not possible for deletes in degraded placement groups
|
||||||
|
// - With simple "fixed" metadata tables we can't just clear the metadata entry of the latest
|
||||||
|
// object version. We must clear all previous entries, too.
|
||||||
|
// FIXME: Fix both problems - probably by switching from "fixed" metadata tables to "dynamic" ones
|
||||||
|
|
||||||
// AND We must do it in batches, for the sake of reduced fsync call count
|
// AND We must do it in batches, for the sake of reduced fsync call count
|
||||||
// AND We must know what we stabilize. Basic workflow is like:
|
// AND We must know what we stabilize. Basic workflow is like:
|
||||||
|
|
Loading…
Reference in New Issue