vitastor/blockstore_flush.cpp

619 lines
20 KiB
C++

#include "blockstore.h"
// Builds the flusher for blockstore <bs> together with a pool of
// <flusher_count> coroutines that all point back to this flusher.
journal_flusher_t::journal_flusher_t(int flusher_count, blockstore *bs)
{
    this->bs = bs;
    this->flusher_count = flusher_count;
    active_flushers = 0;
    active_until_sync = 0;
    journal_trim_counter = 0;
    // A single coroutine syncs after every flush, several ones sync once per half of the pool
    if (flusher_count == 1)
    {
        sync_threshold = 1;
    }
    else
    {
        sync_threshold = flusher_count/2;
    }
    // The journal is trimmed with the same period as syncs are issued
    journal_trim_interval = sync_threshold;
    // With an in-memory journal its buffer is reused for the "superblock",
    // otherwise a separate 512-byte aligned sector is allocated (and freed in the destructor)
    if (bs->journal.inmemory)
    {
        journal_superblock = bs->journal.buffer;
    }
    else
    {
        journal_superblock = memalign(512, 512);
    }
    co = new journal_flusher_co[flusher_count];
    for (int n = 0; n < flusher_count; n++)
    {
        auto & worker = co[n];
        worker.bs = bs;
        worker.flusher = this;
    }
}
// Prepares the two completion callbacks shared by all I/O requests of this coroutine.
// Both of them treat a result different from the requested iovec length as fatal
// and otherwise just decrement the counter of requests in flight.
journal_flusher_co::journal_flusher_co()
{
    wait_state = 0;
    // Completion of a read
    simple_callback_r = [this](ring_data_t* data)
    {
        if (data->res == data->iov.iov_len)
        {
            wait_count--;
            return;
        }
        std::string msg = "data read operation failed during flush (";
        msg += std::to_string(data->res);
        msg += " != ";
        msg += std::to_string(data->iov.iov_len);
        msg += "). can't continue, sorry :-(";
        throw std::runtime_error(msg);
    };
    // Completion of a write (also used for fsync requests, which carry a zeroed iovec)
    simple_callback_w = [this](ring_data_t* data)
    {
        if (data->res == data->iov.iov_len)
        {
            wait_count--;
            return;
        }
        std::string msg = "write operation failed (";
        msg += std::to_string(data->res);
        msg += " != ";
        msg += std::to_string(data->iov.iov_len);
        msg += "). in-memory state is corrupted. AAAAAAAaaaaaaaaa!!!111";
        throw std::runtime_error(msg);
    };
}
// Releases the coroutine pool and, if it was allocated by the constructor,
// the journal "superblock" sector.
journal_flusher_t::~journal_flusher_t()
{
    // With an in-memory journal the superblock points into the journal buffer and is not ours to free
    const bool owns_superblock = !bs->journal.inmemory;
    if (owns_superblock)
    {
        free(journal_superblock);
    }
    delete[] co;
}
// Tells whether there is any flushing work: either a coroutine is in the middle
// of a flush or some objects are still queued.
bool journal_flusher_t::is_active()
{
    if (active_flushers > 0)
    {
        return true;
    }
    return flush_queue.size() > 0;
}
// Gives every coroutine of the pool a chance to run, stopping early
// as soon as nothing is being flushed and the queue is empty.
void journal_flusher_t::loop()
{
    int n = 0;
    while (n < flusher_count)
    {
        const bool idle = !active_flushers && !flush_queue.size();
        if (idle)
        {
            break;
        }
        co[n].loop();
        n++;
    }
}
// Schedules object version <ov> for flushing at the END of the queue.
// An object is queued only once; for an already queued object
// only the requested version is raised.
void journal_flusher_t::enqueue_flush(obj_ver_id ov)
{
    auto queued = flush_versions.find(ov.oid);
    if (queued == flush_versions.end())
    {
        // Not queued yet
        flush_versions[ov.oid] = ov.version;
        flush_queue.push_back(ov.oid);
        return;
    }
    if (queued->second < ov.version)
    {
        queued->second = ov.version;
    }
}
// Schedules object version <ov> for flushing at the FRONT of the queue.
// An object is queued only once; for an already queued object
// only the requested version is raised (its queue position is kept).
void journal_flusher_t::unshift_flush(obj_ver_id ov)
{
    auto queued = flush_versions.find(ov.oid);
    if (queued == flush_versions.end())
    {
        // Not queued yet
        flush_versions[ov.oid] = ov.version;
        flush_queue.push_front(ov.oid);
        return;
    }
    if (queued->second < ov.version)
    {
        queued->second = ov.version;
    }
}
// Coroutine helper: request a submission queue entry from the blockstore.
// Defines the label resume_<label>, asks bs->get_sqe() for an entry and stores it in <sqe>.
// If no entry is available, <label> is saved in wait_state and the ENCLOSING function
// returns false; the function is expected to jump back to resume_<label> on its next call
// so that the request is retried. On success <data> is set to the entry's user_data.
// Requires <bs>, <sqe>, <data> and <wait_state> to be visible at the point of use,
// and each <label> may be used only once per function.
#define await_sqe(label) \
resume_##label:\
sqe = bs->get_sqe();\
if (!sqe)\
{\
wait_state = label;\
return false;\
}\
data = ((ring_data_t*)sqe->user_data);
// Body of one flusher coroutine.
//
// Takes objects from the flusher queue one by one and for each of them:
// collects the journaled (small) writes into buffers, reads the affected metadata sector(s),
// writes the data to the data device, writes the updated metadata, syncs in batches
// (see fsync_batch()), updates clean_db/dirty_db (see update_clean_db()) and periodically
// trims the journal.
//
// The function is a hand-written coroutine: whenever it has to wait (for a free SQE or for
// I/O completion) it stores the resume point in wait_state and returns false; the next call
// jumps straight to that point. All state that must survive such a return is kept in members.
// Returns true when there is nothing (more) to do for this coroutine right now.
// Throws std::runtime_error on states that the code considers to be bugs.
bool journal_flusher_co::loop()
{
    // This is much better than implementing the whole function as an FSM
    // Maybe I should consider a coroutine library like https://github.com/hnes/libaco ...
    if (wait_state == 1)
        goto resume_1;
    else if (wait_state == 2)
        goto resume_2;
    else if (wait_state == 3)
        goto resume_3;
    else if (wait_state == 4)
        goto resume_4;
    else if (wait_state == 5)
        goto resume_5;
    else if (wait_state == 6)
        goto resume_6;
    else if (wait_state == 7)
        goto resume_7;
    else if (wait_state == 8)
        goto resume_8;
    else if (wait_state == 9)
        goto resume_9;
    else if (wait_state == 10)
        goto resume_10;
    else if (wait_state == 11)
        goto resume_11;
    else if (wait_state == 12)
        goto resume_12;
    else if (wait_state == 13)
        goto resume_13;
    else if (wait_state == 14)
        goto resume_14;
    else if (wait_state == 15)
        goto resume_15;
resume_0:
    // Every new object starts without a pending resume point. Without this reset the value
    // left by the previous flush would survive a "return true" below and the next call
    // would jump into the middle of an already finished flush.
    wait_state = 0;
    if (!flusher->flush_queue.size())
    {
        return true;
    }
    cur.oid = flusher->flush_queue.front();
    cur.version = flusher->flush_versions[cur.oid];
    flusher->flush_queue.pop_front();
    flusher->flush_versions.erase(cur.oid);
    dirty_end = bs->dirty_db.find(cur);
    if (dirty_end != bs->dirty_db.end())
    {
        repeat_it = flusher->sync_to_repeat.find(cur.oid);
        if (repeat_it != flusher->sync_to_repeat.end())
        {
#ifdef BLOCKSTORE_DEBUG
            printf("Postpone %lu:%lu v%lu\n", cur.oid.inode, cur.oid.stripe, cur.version);
#endif
            // We don't flush different parts of history of the same object in parallel
            // So we check if someone is already flushing this object
            // In that case we set sync_to_repeat and pick another object
            // Another coroutine will see it and re-queue the object after it finishes
            if (repeat_it->second < cur.version)
                repeat_it->second = cur.version;
            goto resume_0;
        }
        else
            flusher->sync_to_repeat[cur.oid] = 0;
#ifdef BLOCKSTORE_DEBUG
        printf("Flushing %lu:%lu v%lu\n", cur.oid.inode, cur.oid.stripe, cur.version);
#endif
        dirty_it = dirty_end;
        flusher->active_flushers++;
        flusher->active_until_sync++;
        v.clear();
        wait_count = 0;
        copy_count = 0;
        clean_loc = UINT64_MAX;
        has_delete = false;
        skip_copy = false;
        // Walk the history of the object from the requested version backwards
        while (1)
        {
            if (dirty_it->second.state == ST_J_STABLE && !skip_copy)
            {
                // First we submit all reads
                // <v> holds the buffers of already visited (newer) writes ordered by offset.
                // <offset>/<len> describe the part of this write which is not examined yet;
                // only the gaps not covered by <v> are copied from the journal.
                offset = dirty_it->second.offset;
                len = dirty_it->second.len;
                it = v.begin();
                while (1)
                {
                    // Skip the buffers which end at or before <offset>
                    for (; it != v.end(); it++)
                        if (it->offset+it->len > offset)
                            break;
                    if (it == v.end() || it->offset > offset)
                    {
                        // There is a gap starting at <offset>, fill it up to the next buffer or to the end of the write
                        submit_offset = dirty_it->second.location + offset - dirty_it->second.offset;
                        submit_len = it == v.end() || it->offset >= offset+len ? len : it->offset-offset;
                        it = v.insert(it, (copy_buffer_t){ .offset = offset, .len = submit_len, .buf = memalign(512, submit_len) });
                        copy_count++;
                        // <it> (not v.back()) is the buffer which was just inserted
                        if (bs->journal.inmemory)
                        {
                            // Take it from memory
                            memcpy(it->buf, bs->journal.buffer + submit_offset, submit_len);
                        }
                        else
                        {
                            // Read it from disk
                            await_sqe(1);
                            data->iov = (struct iovec){ it->buf, (size_t)submit_len };
                            data->callback = simple_callback_r;
                            my_uring_prep_readv(
                                sqe, bs->journal.fd, &data->iov, 1, bs->journal.offset + submit_offset
                            );
                            wait_count++;
                        }
                    }
                    // Here <it> is the buffer which covers <offset>
                    if (it == v.end() || it->offset+it->len >= offset+len)
                    {
                        break;
                    }
                    // Continue with the part of the write located after that buffer
                    len = offset + len - (it->offset+it->len);
                    offset = it->offset+it->len;
                }
            }
            else if (dirty_it->second.state == ST_D_STABLE && !skip_copy)
            {
                // There is an unflushed big write. Copy small writes in its position
                clean_loc = dirty_it->second.location;
                skip_copy = true;
            }
            else if (dirty_it->second.state == ST_DEL_STABLE && !skip_copy)
            {
                // There is an unflushed delete
                has_delete = true;
                skip_copy = true;
            }
            else if (!IS_STABLE(dirty_it->second.state))
            {
                char err[1024];
                snprintf(
                    err, 1024, "BUG: Unexpected dirty_entry %lu:%lu v%lu state during flush: %d",
                    dirty_it->first.oid.inode, dirty_it->first.oid.stripe, dirty_it->first.version, dirty_it->second.state
                );
                throw std::runtime_error(err);
            }
            if (dirty_it == bs->dirty_db.begin())
            {
                break;
            }
            dirty_it--;
            if (dirty_it->first.oid != cur.oid)
            {
                break;
            }
        }
        if (copy_count == 0 && clean_loc == UINT64_MAX && !has_delete)
        {
            // Nothing to flush
            flusher->active_flushers--;
            flusher->active_until_sync--;
            repeat_it = flusher->sync_to_repeat.find(cur.oid);
            if (repeat_it->second > cur.version)
            {
                // Requeue version
                flusher->unshift_flush({ .oid = cur.oid, .version = repeat_it->second });
            }
            flusher->sync_to_repeat.erase(repeat_it);
            goto resume_0;
        }
        // Find it in clean_db
        {
            auto clean_it = bs->clean_db.find(cur.oid);
            old_clean_loc = (clean_it != bs->clean_db.end() ? clean_it->second.location : UINT64_MAX);
            old_clean_ver = (clean_it != bs->clean_db.end() ? clean_it->second.version : 0);
        }
        if (clean_loc == UINT64_MAX)
        {
            // No big write in the flushed history, so the object has to stay in its current clean location
            if ((copy_count > 0 && has_delete) || old_clean_loc == UINT64_MAX)
            {
                // Object not present at all. This is a bug.
                char err[1024];
                snprintf(
                    err, 1024, "BUG: Object %lu:%lu v%lu that we are trying to flush is not allocated on the data device",
                    cur.oid.inode, cur.oid.stripe, cur.version
                );
                throw std::runtime_error(err);
            }
            else
                clean_loc = old_clean_loc;
        }
        else
            has_delete = false;
        // Also we need to submit metadata read(s). We do read-modify-write cycle(s) for every operation.
    resume_2:
        if (!modify_meta_read(clean_loc, meta_new, 2))
        {
            // modify_meta_read() left wait_state at 0, turn it into our own resume point
            wait_state += 2;
            return false;
        }
        if (old_clean_loc != UINT64_MAX && old_clean_loc != clean_loc)
        {
        resume_14:
            if (!modify_meta_read(old_clean_loc, meta_old, 14))
            {
                wait_state += 14;
                return false;
            }
        }
        else
            meta_old.submitted = false;
    resume_3:
        // Wait for all reads submitted above (journal data and metadata sectors)
        if (wait_count > 0)
        {
            wait_state = 3;
            return false;
        }
        // Mark the sectors read by this coroutine as loaded and wake up
        // whoever may be waiting for them
        if (meta_new.submitted)
        {
            meta_new.it->second.state = 1;
            bs->ringloop->wakeup(bs->ring_consumer);
        }
        if (meta_old.submitted)
        {
            meta_old.it->second.state = 1;
            bs->ringloop->wakeup(bs->ring_consumer);
        }
        // Reads completed, submit writes
        for (it = v.begin(); it != v.end(); it++)
        {
            await_sqe(4);
            data->iov = (struct iovec){ it->buf, (size_t)it->len };
            data->callback = simple_callback_w;
            my_uring_prep_writev(
                sqe, bs->data_fd, &data->iov, 1, bs->data_offset + clean_loc + it->offset
            );
            wait_count++;
        }
    resume_5:
        // And metadata writes, but only after data writes complete
        if (meta_new.it->second.state == 0 || wait_count > 0)
        {
            // metadata sector is still being read or data is still being written, wait for it
            wait_state = 5;
            return false;
        }
        if (old_clean_loc != UINT64_MAX && old_clean_loc != clean_loc)
        {
            if (meta_old.it->second.state == 0)
            {
                wait_state = 5;
                return false;
            }
            // The object moves to another block: clear its old metadata entry
            ((clean_disk_entry*)meta_old.it->second.buf)[meta_old.pos] = { 0 };
            await_sqe(15);
            data->iov = (struct iovec){ meta_old.it->second.buf, 512 };
            data->callback = simple_callback_w;
            my_uring_prep_writev(
                sqe, bs->meta_fd, &data->iov, 1, bs->meta_offset + meta_old.sector
            );
            wait_count++;
        }
        // Write the new entry (or an empty one if the object is deleted)
        ((clean_disk_entry*)meta_new.it->second.buf)[meta_new.pos] = has_delete
            ? (clean_disk_entry){ 0 }
            : (clean_disk_entry){
                .oid = cur.oid,
                .version = cur.version,
            };
        await_sqe(6);
        data->iov = (struct iovec){ meta_new.it->second.buf, 512 };
        data->callback = simple_callback_w;
        my_uring_prep_writev(
            sqe, bs->meta_fd, &data->iov, 1, bs->meta_offset + meta_new.sector
        );
        wait_count++;
    resume_7:
        if (wait_count > 0)
        {
            wait_state = 7;
            return false;
        }
        // Done, free all buffers
        meta_new.it->second.usage_count--;
        if (meta_new.it->second.usage_count == 0)
        {
            free(meta_new.it->second.buf);
            flusher->meta_sectors.erase(meta_new.it);
        }
        if (old_clean_loc != UINT64_MAX && old_clean_loc != clean_loc)
        {
            meta_old.it->second.usage_count--;
            if (meta_old.it->second.usage_count == 0)
            {
                free(meta_old.it->second.buf);
                flusher->meta_sectors.erase(meta_old.it);
            }
        }
        for (it = v.begin(); it != v.end(); it++)
        {
            free(it->buf);
        }
        v.clear();
        // And sync everything (in batches - not per each operation!)
        flusher->active_until_sync--;
        // Resume points 8-11 belong to fsync_batch(), which dispatches on wait_state itself
    resume_8:
    resume_9:
    resume_10:
    resume_11:
        if (!fsync_batch())
        {
            return false;
        }
        // Update clean_db and dirty_db, free old data locations
        update_clean_db();
        // Clear unused part of the journal every <journal_trim_interval> flushes
        if (!((++flusher->journal_trim_counter) % flusher->journal_trim_interval))
        {
            flusher->journal_trim_counter = 0;
            if (bs->journal.trim())
            {
                // Update journal "superblock"
                await_sqe(12);
                data->callback = simple_callback_w;
                *((journal_entry_start*)flusher->journal_superblock) = {
                    .crc32 = 0,
                    .magic = JOURNAL_MAGIC,
                    .type = JE_START,
                    .size = sizeof(journal_entry_start),
                    .reserved = 0,
                    .journal_start = bs->journal.used_start,
                };
                ((journal_entry_start*)flusher->journal_superblock)->crc32 = je_crc32((journal_entry*)flusher->journal_superblock);
                data->iov = (struct iovec){ flusher->journal_superblock, 512 };
                my_uring_prep_writev(sqe, bs->journal.fd, &data->iov, 1, bs->journal.offset);
                wait_count++;
            resume_13:
                if (wait_count > 0)
                {
                    wait_state = 13;
                    return false;
                }
            }
        }
        // All done
#ifdef BLOCKSTORE_DEBUG
        printf("Flushed %lu:%lu v%lu\n", cur.oid.inode, cur.oid.stripe, cur.version);
#endif
        flusher->active_flushers--;
        repeat_it = flusher->sync_to_repeat.find(cur.oid);
        if (repeat_it->second > cur.version)
        {
            // Requeue version
            flusher->unshift_flush({ .oid = cur.oid, .version = repeat_it->second });
        }
        flusher->sync_to_repeat.erase(repeat_it);
        goto resume_0;
    }
    // The queued version is not in dirty_db: nothing to do for it.
    // wait_state is 0 here, so the next call starts with the next queued object.
    return true;
}
// Makes sure that the 512-byte metadata sector holding the entry of data location <meta_loc>
// is (being) loaded into flusher->meta_sectors and takes a reference to it.
// Fills <wr> with the sector offset, the entry index within the sector, the map iterator and
// the <submitted> flag telling whether THIS call submitted the read.
// Returns false if no SQE was available; in that case wait_state is left at 0 and the caller
// is expected to turn it into <wait_base> and call again later with the same arguments.
bool journal_flusher_co::modify_meta_read(uint64_t meta_loc, flusher_meta_write_t &wr, int wait_base)
{
    // Resumed after a failed attempt to get an SQE: <wr> is already set up, only retry the read
    if (wait_state == wait_base)
        goto resume_0;
    // But we must check if the same sector is already in memory.
    // Another option is to keep all raw metadata in memory all the time. FIXME: Maybe add this mode.
    // And yet another option is to use LSM trees for metadata, but it sophisticates everything a lot,
    // so I'll avoid it as long as I can.
    {
        const uint64_t block_num = meta_loc >> bs->block_order;
        const auto entries_per_sector = 512 / sizeof(clean_disk_entry);
        wr.sector = (block_num / entries_per_sector) * 512;
        wr.pos = (block_num % entries_per_sector);
    }
    wr.it = flusher->meta_sectors.find(wr.sector);
    if (wr.it != flusher->meta_sectors.end())
    {
        // Someone has already loaded (or is loading) this sector, just share it
        wr.submitted = false;
        wr.it->second.usage_count++;
        return true;
    }
    // Not in memory yet, read it
    wr.it = flusher->meta_sectors.emplace(wr.sector, (meta_sector_t){
        .offset = wr.sector,
        .len = 512,
        .state = 0, // 0 = not read yet
        .buf = memalign(512, 512),
        .usage_count = 1,
    }).first;
    await_sqe(0);
    wr.submitted = true;
    data->iov = (struct iovec){ wr.it->second.buf, 512 };
    data->callback = simple_callback_r;
    my_uring_prep_readv(
        sqe, bs->meta_fd, &data->iov, 1, bs->meta_offset + wr.sector
    );
    wait_count++;
    return true;
}
// Applies a finished flush of <cur> to the in-memory state:
// releases data blocks that are not used anymore, updates (or removes) the clean_db entry,
// drops journal sector usage of every flushed dirty entry and finally erases
// all entries of the object up to <dirty_end> (inclusive) from dirty_db.
void journal_flusher_co::update_clean_db()
{
    // The object was moved: its previous clean block becomes free
    const bool relocated = old_clean_loc != UINT64_MAX && old_clean_loc != clean_loc;
    if (relocated)
    {
#ifdef BLOCKSTORE_DEBUG
        printf("Free block %lu\n", old_clean_loc >> bs->block_order);
#endif
        bs->data_alloc->set(old_clean_loc >> bs->block_order, false);
    }
    if (!has_delete)
    {
        bs->clean_db[cur.oid] = {
            .version = cur.version,
            .location = clean_loc,
        };
    }
    else
    {
        // Deleted object: forget it and release its block.
        // clean_loc is reset so that every big write below is treated as unused
        bs->clean_db.erase(bs->clean_db.find(cur.oid));
        bs->data_alloc->set(clean_loc >> bs->block_order, false);
        clean_loc = UINT64_MAX;
    }
    // Go through the flushed history of the object from the newest entry backwards
    for (dirty_it = dirty_end; ; )
    {
        const bool unused_big_write = IS_BIG_WRITE(dirty_it->second.state) && dirty_it->second.location != clean_loc;
        if (unused_big_write)
        {
#ifdef BLOCKSTORE_DEBUG
            printf("Free block %lu\n", dirty_it->second.location >> bs->block_order);
#endif
            bs->data_alloc->set(dirty_it->second.location >> bs->block_order, false);
        }
#ifdef BLOCKSTORE_DEBUG
        printf("remove usage of journal offset %lu by %lu:%lu v%lu\n", dirty_it->second.journal_sector, dirty_it->first.oid.inode, dirty_it->first.oid.stripe, dirty_it->first.version);
#endif
        // One reference to the journal sector less; drop the sector when nobody uses it
        int used = --bs->journal.used_sectors[dirty_it->second.journal_sector];
        if (used == 0)
        {
            bs->journal.used_sectors.erase(dirty_it->second.journal_sector);
        }
        if (dirty_it == bs->dirty_db.begin())
        {
            break;
        }
        dirty_it--;
        if (dirty_it->first.oid != cur.oid)
        {
            break;
        }
    }
    // Then, basically, remove everything up to the current version from dirty_db...
    // (if the loop stopped on an entry of another object, step back to ours first)
    if (dirty_it->first.oid != cur.oid)
    {
        dirty_it++;
    }
    auto erase_end = std::next(dirty_end);
    bs->dirty_db.erase(dirty_it, erase_end);
}
// Batched fsync step of the flusher coroutine.
//
// The coroutine joins the newest not yet completed sync batch (or opens a new one).
// When the batch collects <sync_threshold> participants, or when nobody else is going to
// join it, this coroutine submits fdatasync for the data device (and for the metadata device
// if it is a different fd), waits for it and marks the batch as completed; otherwise it waits
// until another coroutine does that. Does nothing if fsync is disabled in the blockstore.
//
// Follows the same convention as loop(): returns false after saving a resume point (8-11)
// in wait_state when it has to wait, and true when the sync is done.
bool journal_flusher_co::fsync_batch()
{
    if (wait_state == 8)
        goto resume_8;
    else if (wait_state == 9)
        goto resume_9;
    else if (wait_state == 10)
        goto resume_10;
    else if (wait_state == 11)
        goto resume_11;
    if (!bs->disable_fsync)
    {
        // Look at the LAST batch (end() itself must not be dereferenced)
        cur_sync = flusher->syncs.end();
        if (cur_sync != flusher->syncs.begin())
            cur_sync--;
        // No batches at all or the last one is already synced - start a new one
        if (cur_sync == flusher->syncs.end() || cur_sync->state == 1)
            cur_sync = flusher->syncs.emplace(flusher->syncs.end(), (flusher_sync_t){ .ready_count = 0, .state = 0 });
        cur_sync->ready_count++;
        if (cur_sync->ready_count >= flusher->sync_threshold ||
            (!flusher->active_until_sync && (!flusher->flush_queue.size() || flusher->active_flushers >= flusher->flusher_count)))
        {
            // Sync batch is ready. Do it.
            await_sqe(9);
            data->callback = simple_callback_w;
            data->iov = { 0 };
            my_uring_prep_fsync(sqe, bs->data_fd, IORING_FSYNC_DATASYNC);
            wait_count++;
            if (bs->meta_fd != bs->data_fd)
            {
                await_sqe(10);
                data->callback = simple_callback_w;
                data->iov = { 0 };
                my_uring_prep_fsync(sqe, bs->meta_fd, IORING_FSYNC_DATASYNC);
                wait_count++;
            }
        resume_11:
            if (wait_count > 0)
            {
                wait_state = 11;
                return false;
            }
            // Sync completed. All previous coroutines waiting for it must be resumed
            cur_sync->state = 1;
            bs->ringloop->wakeup(bs->ring_consumer);
        }
        // Wait until someone else sends and completes a sync.
    resume_8:
        if (!cur_sync->state)
        {
            wait_state = 8;
            return false;
        }
        // The last participant leaving the batch removes it
        cur_sync->ready_count--;
        if (cur_sync->ready_count == 0)
        {
            flusher->syncs.erase(cur_sync);
        }
    }
    return true;
}