Extract disk opening functions to separate module

rm-left-on-dead
Vitaliy Filippov 2022-06-30 00:22:48 +03:00
parent 30907852c2
commit dfd80626bd
15 changed files with 561 additions and 517 deletions

src/CMakeLists.txt

@@ -64,7 +64,7 @@ include_directories(
# libvitastor_blk.so
add_library(vitastor_blk SHARED
allocator.cpp blockstore.cpp blockstore_impl.cpp blockstore_init.cpp blockstore_open.cpp blockstore_journal.cpp blockstore_read.cpp
allocator.cpp blockstore.cpp blockstore_impl.cpp blockstore_disk.cpp blockstore_init.cpp blockstore_open.cpp blockstore_journal.cpp blockstore_read.cpp
blockstore_write.cpp blockstore_sync.cpp blockstore_stable.cpp blockstore_rollback.cpp blockstore_flush.cpp crc32c.c ringloop.cpp
)
target_link_libraries(vitastor_blk

src/blockstore_disk.cpp Normal file

@@ -0,0 +1,322 @@
// Copyright (c) Vitaliy Filippov, 2019+
// License: VNPL-1.1 (see README.md for details)
#include <sys/file.h>
#include <stdexcept>
#include "blockstore_impl.h"
#include "blockstore_disk.h"
static uint32_t is_power_of_two(uint64_t value)
{
uint32_t l = 0;
while (value > 1)
{
if (value & 1)
{
return 64;
}
value = value >> 1;
l++;
}
return l;
}
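
An editorial note on the helper's contract (not part of the commit): for an exact power of two it returns the exponent, i.e. the block order, and for anything else it returns 64, which the caller rejects, so a single call both validates the size and yields the shift used for block addressing:
// Sketch of the contract parse_config relies on below:
assert(is_power_of_two(131072) == 17);     // 128 KB => block_order = 17
assert(is_power_of_two(131072 + 1) == 64); // not a power of two => "Bad block size"
// elsewhere in the commit: block index = location >> block_order
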
void blockstore_disk_t::parse_config(std::map<std::string, std::string> & config)
{
// Parse
if (config["disable_device_lock"] == "true" || config["disable_device_lock"] == "1" || config["disable_device_lock"] == "yes")
{
disable_flock = true;
}
cfg_journal_size = strtoull(config["journal_size"].c_str(), NULL, 10);
data_device = config["data_device"];
data_offset = strtoull(config["data_offset"].c_str(), NULL, 10);
cfg_data_size = strtoull(config["data_size"].c_str(), NULL, 10);
meta_device = config["meta_device"];
meta_offset = strtoull(config["meta_offset"].c_str(), NULL, 10);
data_block_size = strtoull(config["block_size"].c_str(), NULL, 10);
journal_device = config["journal_device"];
journal_offset = strtoull(config["journal_offset"].c_str(), NULL, 10);
disk_alignment = strtoull(config["disk_alignment"].c_str(), NULL, 10);
journal_block_size = strtoull(config["journal_block_size"].c_str(), NULL, 10);
meta_block_size = strtoull(config["meta_block_size"].c_str(), NULL, 10);
bitmap_granularity = strtoull(config["bitmap_granularity"].c_str(), NULL, 10);
// Validate
if (!data_block_size)
{
data_block_size = (1 << DEFAULT_DATA_BLOCK_ORDER);
}
if ((block_order = is_power_of_two(data_block_size)) >= 64 || data_block_size < MIN_DATA_BLOCK_SIZE || data_block_size >= MAX_DATA_BLOCK_SIZE)
{
throw std::runtime_error("Bad block size");
}
if (!disk_alignment)
{
disk_alignment = 4096;
}
else if (disk_alignment % DIRECT_IO_ALIGNMENT)
{
throw std::runtime_error("disk_alignment must be a multiple of "+std::to_string(DIRECT_IO_ALIGNMENT));
}
if (!journal_block_size)
{
journal_block_size = 4096;
}
else if (journal_block_size % DIRECT_IO_ALIGNMENT)
{
throw std::runtime_error("journal_block_size must be a multiple of "+std::to_string(DIRECT_IO_ALIGNMENT));
}
if (!meta_block_size)
{
meta_block_size = 4096;
}
else if (meta_block_size % DIRECT_IO_ALIGNMENT)
{
throw std::runtime_error("meta_block_size must be a multiple of "+std::to_string(DIRECT_IO_ALIGNMENT));
}
if (data_offset % disk_alignment)
{
throw std::runtime_error("data_offset must be a multiple of disk_alignment = "+std::to_string(disk_alignment));
}
if (!bitmap_granularity)
{
bitmap_granularity = DEFAULT_BITMAP_GRANULARITY;
}
else if (bitmap_granularity % disk_alignment)
{
throw std::runtime_error("Sparse write tracking granularity must be a multiple of disk_alignment = "+std::to_string(disk_alignment));
}
if (data_block_size % bitmap_granularity)
{
throw std::runtime_error("Block size must be a multiple of sparse write tracking granularity");
}
if (journal_device == meta_device || meta_device == "" && journal_device == data_device)
{
journal_device = "";
}
if (meta_device == data_device)
{
meta_device = "";
}
if (meta_offset % meta_block_size)
{
throw std::runtime_error("meta_offset must be a multiple of meta_block_size = "+std::to_string(meta_block_size));
}
if (journal_offset % journal_block_size)
{
throw std::runtime_error("journal_offset must be a multiple of journal_block_size = "+std::to_string(journal_block_size));
}
clean_entry_bitmap_size = data_block_size / bitmap_granularity / 8;
clean_entry_size = sizeof(clean_disk_entry) + 2*clean_entry_bitmap_size;
}
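
parse_config only reads string keys, so a single-device layout can be sketched as below. This is an editorial example rather than code from the commit; the device path and offsets are illustrative (journal at offset 0, metadata after it, data after both):
std::map<std::string, std::string> cfg;
cfg["data_device"]  = "/dev/sdb";   // hypothetical; meta/journal default to the same device
cfg["block_size"]   = "131072";     // must be a power of two
cfg["journal_size"] = "16777216";   // 16 MB journal starting at journal_offset = 0
cfg["meta_offset"]  = "16777216";   // metadata directly after the journal
cfg["data_offset"]  = "536870912";  // first 512 MB reserved for journal + metadata
blockstore_disk_t dsk;
dsk.parse_config(cfg);              // throws std::runtime_error on bad combinations
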
void blockstore_disk_t::calc_lengths()
{
// data
data_len = data_device_size - data_offset;
if (data_fd == meta_fd && data_offset < meta_offset)
{
data_len = meta_offset - data_offset;
}
if (data_fd == journal_fd && data_offset < journal_offset)
{
data_len = data_len < journal_offset-data_offset
? data_len : journal_offset-data_offset;
}
if (cfg_data_size != 0)
{
if (data_len < cfg_data_size)
{
throw std::runtime_error("Data area ("+std::to_string(data_len)+
" bytes) is smaller than configured size ("+std::to_string(cfg_data_size)+" bytes)");
}
data_len = cfg_data_size;
}
// meta
uint64_t meta_area_size = (meta_fd == data_fd ? data_device_size : meta_device_size) - meta_offset;
if (meta_fd == data_fd && meta_offset <= data_offset)
{
meta_area_size = data_offset - meta_offset;
}
if (meta_fd == journal_fd && meta_offset <= journal_offset)
{
meta_area_size = meta_area_size < journal_offset-meta_offset
? meta_area_size : journal_offset-meta_offset;
}
// journal
journal_len = (journal_fd == data_fd ? data_device_size : (journal_fd == meta_fd ? meta_device_size : journal_device_size)) - journal_offset;
if (journal_fd == data_fd && journal_offset <= data_offset)
{
journal_len = data_offset - journal_offset;
}
if (journal_fd == meta_fd && journal_offset <= meta_offset)
{
journal_len = journal_len < meta_offset-journal_offset
? journal_len : meta_offset-journal_offset;
}
// required metadata size
block_count = data_len / data_block_size;
meta_len = (1 + (block_count - 1 + meta_block_size / clean_entry_size) / (meta_block_size / clean_entry_size)) * meta_block_size;
if (meta_area_size < meta_len)
{
throw std::runtime_error("Metadata area is too small, need at least "+std::to_string(meta_len)+" bytes");
}
// requested journal size
if (cfg_journal_size > journal_len)
{
throw std::runtime_error("Requested journal_size is too large");
}
else if (cfg_journal_size > 0)
{
journal_len = cfg_journal_size;
}
if (journal_len < MIN_JOURNAL_SIZE)
{
throw std::runtime_error("Journal is too small, need at least "+std::to_string(MIN_JOURNAL_SIZE)+" bytes");
}
}
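
The meta_len expression is one superblock block plus ceil(block_count / entries_per_block) entry blocks, written with the (n - 1 + d) / d integer-ceiling idiom. A worked example, assuming sizeof(clean_disk_entry) == 24 (a 16-byte object id plus an 8-byte version; the struct itself is outside this diff):
// data_block_size = 128 KB, bitmap_granularity = 4 KB:
//   clean_entry_bitmap_size = 131072 / 4096 / 8 = 4 bytes
//   clean_entry_size = 24 + 2*4 = 32 => 4096 / 32 = 128 entries per meta block
// 1 TB data area => block_count = 8388608
//   meta_len = (1 + ceil(8388608 / 128)) * 4096 = 65537 * 4096 ~= 256 MB
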
static void check_size(int fd, uint64_t *size, uint64_t *sectsize, std::string name)
{
int sect;
struct stat st;
if (fstat(fd, &st) < 0)
{
throw std::runtime_error("Failed to stat "+name);
}
if (S_ISREG(st.st_mode))
{
*size = st.st_size;
if (sectsize)
{
*sectsize = st.st_blksize;
}
}
else if (S_ISBLK(st.st_mode))
{
if (ioctl(fd, BLKGETSIZE64, size) < 0 ||
ioctl(fd, BLKSSZGET, &sect) < 0)
{
throw std::runtime_error("Failed to get "+name+" size or block size: "+strerror(errno));
}
if (sectsize)
{
*sectsize = sect;
}
}
else
{
throw std::runtime_error(name+" is neither a file nor a block device");
}
}
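
check_size covers both backing types: a regular file reports st_size and the filesystem's preferred I/O size from fstat(), while a block device is queried with the standard Linux ioctls BLKGETSIZE64 (size in bytes) and BLKSSZGET (logical sector size). A sketch of how the open_*() functions below consume it (the device path is hypothetical):
uint64_t dev_size = 0, sect_size = 0;
int fd = open("/dev/sdb", O_DIRECT|O_RDWR);
check_size(fd, &dev_size, &sect_size, "data device"); // throws on failure
// sect_size then bounds the alignment checks performed by the callers
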
void blockstore_disk_t::open_data()
{
data_fd = open(data_device.c_str(), O_DIRECT|O_RDWR);
if (data_fd == -1)
{
throw std::runtime_error("Failed to open data device");
}
check_size(data_fd, &data_device_size, &data_device_sect, "data device");
if (disk_alignment % data_device_sect)
{
throw std::runtime_error(
"disk_alignment ("+std::to_string(disk_alignment)+
") is not a multiple of data device sector size ("+std::to_string(data_device_sect)+")"
);
}
if (data_offset >= data_device_size)
{
throw std::runtime_error("data_offset exceeds device size = "+std::to_string(data_device_size));
}
if (!disable_flock && flock(data_fd, LOCK_EX|LOCK_NB) != 0)
{
throw std::runtime_error(std::string("Failed to lock data device: ") + strerror(errno));
}
}
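
Worth noting, since all three devices are opened with O_DIRECT: direct I/O requires every transfer's offset, length and buffer address to be sector-aligned, which is why disk_alignment is validated against data_device_sect here, and why journal_block_size and meta_block_size are validated against their devices' sector sizes below.
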
void blockstore_disk_t::open_meta()
{
if (meta_device != "")
{
meta_offset = 0;
meta_fd = open(meta_device.c_str(), O_DIRECT|O_RDWR);
if (meta_fd == -1)
{
throw std::runtime_error("Failed to open metadata device");
}
check_size(meta_fd, &meta_device_size, &meta_device_sect, "metadata device");
if (meta_offset >= meta_device_size)
{
throw std::runtime_error("meta_offset exceeds device size = "+std::to_string(meta_device_size));
}
if (!disable_flock && flock(meta_fd, LOCK_EX|LOCK_NB) != 0)
{
throw std::runtime_error(std::string("Failed to lock metadata device: ") + strerror(errno));
}
}
else
{
meta_fd = data_fd;
meta_device_sect = data_device_sect;
meta_device_size = 0;
if (meta_offset >= data_device_size)
{
throw std::runtime_error("meta_offset exceeds device size = "+std::to_string(data_device_size));
}
}
if (meta_block_size % meta_device_sect)
{
throw std::runtime_error(
"meta_block_size ("+std::to_string(meta_block_size)+
") is not a multiple of data device sector size ("+std::to_string(meta_device_sect)+")"
);
}
}
void blockstore_disk_t::open_journal()
{
if (journal_device != "")
{
journal_fd = open(journal_device.c_str(), O_DIRECT|O_RDWR);
if (journal_fd == -1)
{
throw std::runtime_error("Failed to open journal device");
}
check_size(journal_fd, &journal_device_size, &journal_device_sect, "journal device");
if (!disable_flock && flock(journal_fd, LOCK_EX|LOCK_NB) != 0)
{
throw std::runtime_error(std::string("Failed to lock journal device: ") + strerror(errno));
}
}
else
{
journal_fd = meta_fd;
journal_device_sect = meta_device_sect;
journal_device_size = 0;
if (journal_offset >= data_device_size)
{
throw std::runtime_error("journal_offset exceeds device size");
}
}
if (journal_block_size % journal_device_sect)
{
throw std::runtime_error(
"journal_block_size ("+std::to_string(journal_block_size)+
") is not a multiple of journal device sector size ("+std::to_string(journal_device_sect)+")"
);
}
}
void blockstore_disk_t::close_all()
{
if (data_fd >= 0)
close(data_fd);
if (meta_fd >= 0 && meta_fd != data_fd)
close(meta_fd);
if (journal_fd >= 0 && journal_fd != meta_fd)
close(journal_fd);
data_fd = meta_fd = journal_fd = -1;
}

src/blockstore_disk.h Normal file

@@ -0,0 +1,42 @@
// Copyright (c) Vitaliy Filippov, 2019+
// License: VNPL-1.1 (see README.md for details)
#pragma once
#include <stdint.h>
#include <string>
#include <map>
struct blockstore_disk_t
{
std::string data_device, meta_device, journal_device;
uint32_t data_block_size;
uint64_t cfg_journal_size, cfg_data_size;
// Required write alignment and journal/metadata/data areas' location alignment
uint32_t disk_alignment = 4096;
// Journal block size - minimum_io_size of the journal device is the best choice
uint64_t journal_block_size = 4096;
// Metadata block size - minimum_io_size of the metadata device is the best choice
uint64_t meta_block_size = 4096;
// Sparse write tracking granularity. 4 KB is a good choice. Must be a multiple of disk_alignment
uint64_t bitmap_granularity = 4096;
// By default, Blockstore locks all opened devices exclusively. This option can be used to disable locking
bool disable_flock = false;
int meta_fd = -1, data_fd = -1, journal_fd = -1;
uint64_t meta_offset, meta_device_sect, meta_device_size, meta_len;
uint64_t data_offset, data_device_sect, data_device_size, data_len;
uint64_t journal_offset, journal_device_sect, journal_device_size, journal_len;
uint32_t block_order;
uint64_t block_count;
uint32_t clean_entry_bitmap_size = 0, clean_entry_size = 0;
void parse_config(std::map<std::string, std::string> & config);
void open_data();
void open_meta();
void open_journal();
void calc_lengths();
void close_all();
};
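
The intended call order can be read off blockstore_impl_t's constructor further down in this commit; an editorial sketch of standalone use, with error handling elided:
blockstore_disk_t dsk;
dsk.parse_config(cfg);  // validate options, derive block_order and entry sizes
dsk.open_data();        // open + flock the data device
dsk.open_meta();        // falls back to data_fd when no separate device is set
dsk.open_journal();     // falls back to meta_fd when no separate device is set
dsk.calc_lengths();     // derive data_len/meta_len/journal_len and check they fit
// ... submit I/O against dsk.data_fd / dsk.meta_fd / dsk.journal_fd ...
dsk.close_all();        // closes each fd exactly once, even when they alias
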

src/blockstore_flush.cpp

@@ -15,11 +15,11 @@ journal_flusher_t::journal_flusher_t(blockstore_impl_t *bs)
active_flushers = 0;
syncing_flushers = 0;
// FIXME: allow to configure flusher_start_threshold and journal_trim_interval
flusher_start_threshold = bs->journal_block_size / sizeof(journal_entry_stable);
flusher_start_threshold = bs->dsk.journal_block_size / sizeof(journal_entry_stable);
journal_trim_interval = 512;
journal_trim_counter = bs->journal.flush_journal ? 1 : 0;
trim_wanted = bs->journal.flush_journal ? 1 : 0;
journal_superblock = bs->journal.inmemory ? bs->journal.buffer : memalign_or_die(MEM_ALIGNMENT, bs->journal_block_size);
journal_superblock = bs->journal.inmemory ? bs->journal.buffer : memalign_or_die(MEM_ALIGNMENT, bs->dsk.journal_block_size);
co = new journal_flusher_co[max_flusher_count];
for (int i = 0; i < max_flusher_count; i++)
{
@@ -486,28 +486,28 @@ resume_1:
bs->ringloop->wakeup();
}
// Reads completed, submit writes and set bitmap bits
if (bs->clean_entry_bitmap_size)
if (bs->dsk.clean_entry_bitmap_size)
{
new_clean_bitmap = (bs->inmemory_meta
? (uint8_t*)meta_new.buf + meta_new.pos*bs->clean_entry_size + sizeof(clean_disk_entry)
: (uint8_t*)bs->clean_bitmap + (clean_loc >> bs->block_order)*(2*bs->clean_entry_bitmap_size));
? (uint8_t*)meta_new.buf + meta_new.pos*bs->dsk.clean_entry_size + sizeof(clean_disk_entry)
: (uint8_t*)bs->clean_bitmap + (clean_loc >> bs->dsk.block_order)*(2*bs->dsk.clean_entry_bitmap_size));
if (clean_init_bitmap)
{
memset(new_clean_bitmap, 0, bs->clean_entry_bitmap_size);
bitmap_set(new_clean_bitmap, clean_bitmap_offset, clean_bitmap_len, bs->bitmap_granularity);
memset(new_clean_bitmap, 0, bs->dsk.clean_entry_bitmap_size);
bitmap_set(new_clean_bitmap, clean_bitmap_offset, clean_bitmap_len, bs->dsk.bitmap_granularity);
}
}
for (it = v.begin(); it != v.end(); it++)
{
if (new_clean_bitmap)
{
bitmap_set(new_clean_bitmap, it->offset, it->len, bs->bitmap_granularity);
bitmap_set(new_clean_bitmap, it->offset, it->len, bs->dsk.bitmap_granularity);
}
await_sqe(4);
data->iov = (struct iovec){ it->buf, (size_t)it->len };
data->callback = simple_callback_w;
my_uring_prep_writev(
sqe, bs->data_fd, &data->iov, 1, bs->data_offset + clean_loc + it->offset
sqe, bs->dsk.data_fd, &data->iov, 1, bs->dsk.data_offset + clean_loc + it->offset
);
wait_count++;
}
@@ -536,35 +536,35 @@ resume_1:
return false;
}
// zero out old metadata entry
memset((uint8_t*)meta_old.buf + meta_old.pos*bs->clean_entry_size, 0, bs->clean_entry_size);
memset((uint8_t*)meta_old.buf + meta_old.pos*bs->dsk.clean_entry_size, 0, bs->dsk.clean_entry_size);
await_sqe(15);
data->iov = (struct iovec){ meta_old.buf, bs->meta_block_size };
data->iov = (struct iovec){ meta_old.buf, bs->dsk.meta_block_size };
data->callback = simple_callback_w;
my_uring_prep_writev(
sqe, bs->meta_fd, &data->iov, 1, bs->meta_offset + meta_old.sector
sqe, bs->dsk.meta_fd, &data->iov, 1, bs->dsk.meta_offset + meta_old.sector
);
wait_count++;
}
if (has_delete)
{
clean_disk_entry *new_entry = (clean_disk_entry*)((uint8_t*)meta_new.buf + meta_new.pos*bs->clean_entry_size);
clean_disk_entry *new_entry = (clean_disk_entry*)((uint8_t*)meta_new.buf + meta_new.pos*bs->dsk.clean_entry_size);
if (new_entry->oid.inode != 0 && new_entry->oid != cur.oid)
{
printf("Fatal error (metadata corruption or bug): tried to delete metadata entry %lu (%lx:%lx v%lu) while deleting %lx:%lx\n",
clean_loc >> bs->block_order, new_entry->oid.inode, new_entry->oid.stripe,
clean_loc >> bs->dsk.block_order, new_entry->oid.inode, new_entry->oid.stripe,
new_entry->version, cur.oid.inode, cur.oid.stripe);
exit(1);
}
// zero out new metadata entry
memset((uint8_t*)meta_new.buf + meta_new.pos*bs->clean_entry_size, 0, bs->clean_entry_size);
memset((uint8_t*)meta_new.buf + meta_new.pos*bs->dsk.clean_entry_size, 0, bs->dsk.clean_entry_size);
}
else
{
clean_disk_entry *new_entry = (clean_disk_entry*)((uint8_t*)meta_new.buf + meta_new.pos*bs->clean_entry_size);
clean_disk_entry *new_entry = (clean_disk_entry*)((uint8_t*)meta_new.buf + meta_new.pos*bs->dsk.clean_entry_size);
if (new_entry->oid.inode != 0 && new_entry->oid != cur.oid)
{
printf("Fatal error (metadata corruption or bug): tried to overwrite non-zero metadata entry %lu (%lx:%lx v%lu) with %lx:%lx v%lu\n",
clean_loc >> bs->block_order, new_entry->oid.inode, new_entry->oid.stripe, new_entry->version,
clean_loc >> bs->dsk.block_order, new_entry->oid.inode, new_entry->oid.stripe, new_entry->version,
cur.oid.inode, cur.oid.stripe, cur.version);
exit(1);
}
@@ -572,20 +572,20 @@ resume_1:
new_entry->version = cur.version;
if (!bs->inmemory_meta)
{
memcpy(&new_entry->bitmap, new_clean_bitmap, bs->clean_entry_bitmap_size);
memcpy(&new_entry->bitmap, new_clean_bitmap, bs->dsk.clean_entry_bitmap_size);
}
// copy latest external bitmap/attributes
if (bs->clean_entry_bitmap_size)
if (bs->dsk.clean_entry_bitmap_size)
{
void *bmp_ptr = bs->clean_entry_bitmap_size > sizeof(void*) ? dirty_end->second.bitmap : &dirty_end->second.bitmap;
memcpy((uint8_t*)(new_entry+1) + bs->clean_entry_bitmap_size, bmp_ptr, bs->clean_entry_bitmap_size);
void *bmp_ptr = bs->dsk.clean_entry_bitmap_size > sizeof(void*) ? dirty_end->second.bitmap : &dirty_end->second.bitmap;
memcpy((uint8_t*)(new_entry+1) + bs->dsk.clean_entry_bitmap_size, bmp_ptr, bs->dsk.clean_entry_bitmap_size);
}
}
await_sqe(6);
data->iov = (struct iovec){ meta_new.buf, bs->meta_block_size };
data->iov = (struct iovec){ meta_new.buf, bs->dsk.meta_block_size };
data->callback = simple_callback_w;
my_uring_prep_writev(
sqe, bs->meta_fd, &data->iov, 1, bs->meta_offset + meta_new.sector
sqe, bs->dsk.meta_fd, &data->iov, 1, bs->dsk.meta_offset + meta_new.sector
);
wait_count++;
resume_7:
@@ -669,9 +669,9 @@ resume_1:
.version = JOURNAL_VERSION,
};
((journal_entry_start*)flusher->journal_superblock)->crc32 = je_crc32((journal_entry*)flusher->journal_superblock);
data->iov = (struct iovec){ flusher->journal_superblock, bs->journal_block_size };
data->iov = (struct iovec){ flusher->journal_superblock, bs->dsk.journal_block_size };
data->callback = simple_callback_w;
my_uring_prep_writev(sqe, bs->journal.fd, &data->iov, 1, bs->journal.offset);
my_uring_prep_writev(sqe, bs->dsk.journal_fd, &data->iov, 1, bs->journal.offset);
wait_count++;
resume_13:
if (wait_count > 0)
@@ -682,7 +682,7 @@ resume_1:
if (!bs->disable_journal_fsync)
{
await_sqe(20);
my_uring_prep_fsync(sqe, bs->journal.fd, IORING_FSYNC_DATASYNC);
my_uring_prep_fsync(sqe, bs->dsk.journal_fd, IORING_FSYNC_DATASYNC);
data->iov = { 0 };
data->callback = simple_callback_w;
resume_21:
@@ -774,7 +774,7 @@ bool journal_flusher_co::scan_dirty(int wait_base)
data->iov = (struct iovec){ it->buf, (size_t)submit_len };
data->callback = simple_callback_r;
my_uring_prep_readv(
sqe, bs->journal.fd, &data->iov, 1, bs->journal.offset + submit_offset
sqe, bs->dsk.journal_fd, &data->iov, 1, bs->journal.offset + submit_offset
);
wait_count++;
}
@@ -825,8 +825,8 @@ bool journal_flusher_co::modify_meta_read(uint64_t meta_loc, flusher_meta_write_
// And yet another option is to use LSM trees for metadata, but it complicates everything a lot,
// so I'll avoid it as long as I can.
wr.submitted = false;
wr.sector = ((meta_loc >> bs->block_order) / (bs->meta_block_size / bs->clean_entry_size)) * bs->meta_block_size;
wr.pos = ((meta_loc >> bs->block_order) % (bs->meta_block_size / bs->clean_entry_size));
wr.sector = ((meta_loc >> bs->dsk.block_order) / (bs->dsk.meta_block_size / bs->dsk.clean_entry_size)) * bs->dsk.meta_block_size;
wr.pos = ((meta_loc >> bs->dsk.block_order) % (bs->dsk.meta_block_size / bs->dsk.clean_entry_size));
if (bs->inmemory_meta)
{
wr.buf = (uint8_t*)bs->metadata_buffer + wr.sector;
@@ -836,20 +836,20 @@ bool journal_flusher_co::modify_meta_read(uint64_t meta_loc, flusher_meta_write_
if (wr.it == flusher->meta_sectors.end())
{
// Not in memory yet, read it
wr.buf = memalign_or_die(MEM_ALIGNMENT, bs->meta_block_size);
wr.buf = memalign_or_die(MEM_ALIGNMENT, bs->dsk.meta_block_size);
wr.it = flusher->meta_sectors.emplace(wr.sector, (meta_sector_t){
.offset = wr.sector,
.len = bs->meta_block_size,
.len = bs->dsk.meta_block_size,
.state = 0, // 0 = not read yet
.buf = wr.buf,
.usage_count = 1,
}).first;
await_sqe(0);
data->iov = (struct iovec){ wr.it->second.buf, bs->meta_block_size };
data->iov = (struct iovec){ wr.it->second.buf, bs->dsk.meta_block_size };
data->callback = simple_callback_r;
wr.submitted = true;
my_uring_prep_readv(
sqe, bs->meta_fd, &data->iov, 1, bs->meta_offset + wr.sector
sqe, bs->dsk.meta_fd, &data->iov, 1, bs->dsk.meta_offset + wr.sector
);
wait_count++;
}
@@ -867,11 +867,11 @@ void journal_flusher_co::update_clean_db()
{
#ifdef BLOCKSTORE_DEBUG
printf("Free block %lu from %lx:%lx v%lu (new location is %lu)\n",
old_clean_loc >> bs->block_order,
old_clean_loc >> bs->dsk.block_order,
cur.oid.inode, cur.oid.stripe, cur.version,
clean_loc >> bs->block_order);
clean_loc >> bs->dsk.block_order);
#endif
bs->data_alloc->set(old_clean_loc >> bs->block_order, false);
bs->data_alloc->set(old_clean_loc >> bs->dsk.block_order, false);
}
auto & clean_db = bs->clean_db_shard(cur.oid);
if (has_delete)
@@ -880,10 +880,10 @@ void journal_flusher_co::update_clean_db()
clean_db.erase(clean_it);
#ifdef BLOCKSTORE_DEBUG
printf("Free block %lu from %lx:%lx v%lu (delete)\n",
clean_loc >> bs->block_order,
clean_loc >> bs->dsk.block_order,
cur.oid.inode, cur.oid.stripe, cur.version);
#endif
bs->data_alloc->set(clean_loc >> bs->block_order, false);
bs->data_alloc->set(clean_loc >> bs->dsk.block_order, false);
clean_loc = UINT64_MAX;
}
else
@@ -932,7 +932,7 @@ bool journal_flusher_co::fsync_batch(bool fsync_meta, int wait_base)
await_sqe(0);
data->iov = { 0 };
data->callback = simple_callback_w;
my_uring_prep_fsync(sqe, fsync_meta ? bs->meta_fd : bs->data_fd, IORING_FSYNC_DATASYNC);
my_uring_prep_fsync(sqe, fsync_meta ? bs->dsk.meta_fd : bs->dsk.data_fd, IORING_FSYNC_DATASYNC);
cur_sync->state = 1;
wait_count++;
resume_2:

src/blockstore_impl.cpp

@@ -11,25 +11,19 @@ blockstore_impl_t::blockstore_impl_t(blockstore_config_t & config, ring_loop_t *
ring_consumer.loop = [this]() { loop(); };
ringloop->register_consumer(&ring_consumer);
initialized = 0;
data_fd = meta_fd = journal.fd = -1;
parse_config(config);
zero_object = (uint8_t*)memalign_or_die(MEM_ALIGNMENT, data_block_size);
zero_object = (uint8_t*)memalign_or_die(MEM_ALIGNMENT, dsk.data_block_size);
try
{
open_data();
open_meta();
open_journal();
dsk.open_data();
dsk.open_meta();
dsk.open_journal();
calc_lengths();
data_alloc = new allocator(block_count);
data_alloc = new allocator(dsk.block_count);
}
catch (std::exception & e)
{
if (data_fd >= 0)
close(data_fd);
if (meta_fd >= 0 && meta_fd != data_fd)
close(meta_fd);
if (journal.fd >= 0 && journal.fd != meta_fd)
close(journal.fd);
dsk.close_all();
throw;
}
flusher = new journal_flusher_t(this);
@@ -41,12 +35,7 @@ blockstore_impl_t::~blockstore_impl_t()
delete flusher;
free(zero_object);
ringloop->unregister_consumer(&ring_consumer);
if (data_fd >= 0)
close(data_fd);
if (meta_fd >= 0 && meta_fd != data_fd)
close(meta_fd);
if (journal.fd >= 0 && journal.fd != meta_fd)
close(journal.fd);
dsk.close_all();
if (metadata_buffer)
free(metadata_buffer);
if (clean_bitmap)
@@ -343,9 +332,9 @@ void blockstore_impl_t::enqueue_op(blockstore_op_t *op)
{
if (op->opcode < BS_OP_MIN || op->opcode > BS_OP_MAX ||
((op->opcode == BS_OP_READ || op->opcode == BS_OP_WRITE || op->opcode == BS_OP_WRITE_STABLE) && (
op->offset >= data_block_size ||
op->len > data_block_size-op->offset ||
(op->len % disk_alignment)
op->offset >= dsk.data_block_size ||
op->len > dsk.data_block_size-op->offset ||
(op->len % dsk.disk_alignment)
)) ||
readonly && op->opcode != BS_OP_READ && op->opcode != BS_OP_LIST)
{

src/blockstore_impl.h

@@ -4,6 +4,7 @@
#pragma once
#include "blockstore.h"
#include "blockstore_disk.h"
#include <sys/types.h>
#include <sys/ioctl.h>
@@ -218,23 +219,10 @@ struct pool_shard_settings_t
class blockstore_impl_t
{
blockstore_disk_t dsk;
/******* OPTIONS *******/
std::string data_device, meta_device, journal_device;
uint32_t data_block_size;
uint64_t meta_offset;
uint64_t data_offset;
uint64_t cfg_journal_size, cfg_data_size;
// Required write alignment and journal/metadata/data areas' location alignment
uint32_t disk_alignment = 4096;
// Journal block size - minimum_io_size of the journal device is the best choice
uint64_t journal_block_size = 4096;
// Metadata block size - minimum_io_size of the metadata device is the best choice
uint64_t meta_block_size = 4096;
// Sparse write tracking granularity. 4 KB is a good choice. Must be a multiple of disk_alignment
uint64_t bitmap_granularity = 4096;
bool readonly = false;
// By default, Blockstore locks all opened devices exclusively. This option can be used to disable locking
bool disable_flock = false;
// It is safe to disable fsync() if drive write cache is writethrough
bool disable_data_fsync = false, disable_meta_fsync = false, disable_journal_fsync = false;
// Enable if you want every operation to be executed with an "implicit fsync"
@@ -269,16 +257,6 @@ class blockstore_impl_t
allocator *data_alloc = NULL;
uint8_t *zero_object;
uint32_t block_order;
uint64_t block_count;
uint32_t clean_entry_bitmap_size = 0, clean_entry_size = 0;
int meta_fd;
int data_fd;
uint64_t meta_device_size, meta_len;
uint64_t data_device_size, data_len;
uint64_t data_device_sect, meta_device_sect, journal_device_sect;
void *metadata_buffer = NULL;
struct journal_t journal;
@@ -395,9 +373,9 @@ public:
// Print diagnostics to stdout
void dump_diagnostics();
inline uint32_t get_block_size() { return data_block_size; }
inline uint64_t get_block_count() { return block_count; }
inline uint32_t get_block_size() { return dsk.data_block_size; }
inline uint64_t get_block_count() { return dsk.block_count; }
inline uint64_t get_free_block_count() { return data_alloc->get_free_count(); }
inline uint32_t get_bitmap_granularity() { return disk_alignment; }
inline uint64_t get_journal_size() { return journal.len; }
inline uint32_t get_bitmap_granularity() { return dsk.disk_alignment; }
inline uint64_t get_journal_size() { return dsk.journal_len; }
};

src/blockstore_init.cpp

@@ -57,9 +57,9 @@ int blockstore_init_meta::loop()
throw std::runtime_error("Failed to allocate metadata read buffer");
// Read superblock
GET_SQE();
data->iov = { metadata_buffer, bs->meta_block_size };
data->iov = { metadata_buffer, bs->dsk.meta_block_size };
data->callback = [this](ring_data_t *data) { handle_event(data); };
my_uring_prep_readv(sqe, bs->meta_fd, &data->iov, 1, bs->meta_offset);
my_uring_prep_readv(sqe, bs->dsk.meta_fd, &data->iov, 1, bs->dsk.meta_offset);
bs->ringloop->submit();
submitted = 1;
resume_1:
@@ -68,16 +68,16 @@ resume_1:
wait_state = 1;
return 1;
}
if (iszero((uint64_t*)metadata_buffer, bs->meta_block_size / sizeof(uint64_t)))
if (iszero((uint64_t*)metadata_buffer, bs->dsk.meta_block_size / sizeof(uint64_t)))
{
{
blockstore_meta_header_v1_t *hdr = (blockstore_meta_header_v1_t *)metadata_buffer;
hdr->zero = 0;
hdr->magic = BLOCKSTORE_META_MAGIC_V1;
hdr->version = BLOCKSTORE_META_VERSION_V1;
hdr->meta_block_size = bs->meta_block_size;
hdr->data_block_size = bs->data_block_size;
hdr->bitmap_granularity = bs->bitmap_granularity;
hdr->meta_block_size = bs->dsk.meta_block_size;
hdr->data_block_size = bs->dsk.data_block_size;
hdr->bitmap_granularity = bs->dsk.bitmap_granularity;
}
if (bs->readonly)
{
@@ -87,9 +87,9 @@ resume_1:
{
printf("Initializing metadata area\n");
GET_SQE();
data->iov = (struct iovec){ metadata_buffer, bs->meta_block_size };
data->iov = (struct iovec){ metadata_buffer, bs->dsk.meta_block_size };
data->callback = [this](ring_data_t *data) { handle_event(data); };
my_uring_prep_writev(sqe, bs->meta_fd, &data->iov, 1, bs->meta_offset);
my_uring_prep_writev(sqe, bs->dsk.meta_fd, &data->iov, 1, bs->dsk.meta_offset);
bs->ringloop->submit();
submitted = 1;
resume_3:
@@ -115,23 +115,23 @@ resume_1:
);
exit(1);
}
if (hdr->meta_block_size != bs->meta_block_size ||
hdr->data_block_size != bs->data_block_size ||
hdr->bitmap_granularity != bs->bitmap_granularity)
if (hdr->meta_block_size != bs->dsk.meta_block_size ||
hdr->data_block_size != bs->dsk.data_block_size ||
hdr->bitmap_granularity != bs->dsk.bitmap_granularity)
{
printf(
"Configuration stored in metadata superblock"
" (meta_block_size=%u, data_block_size=%u, bitmap_granularity=%u)"
" differs from OSD configuration (%lu/%u/%lu).\n",
hdr->meta_block_size, hdr->data_block_size, hdr->bitmap_granularity,
bs->meta_block_size, bs->data_block_size, bs->bitmap_granularity
bs->dsk.meta_block_size, bs->dsk.data_block_size, bs->dsk.bitmap_granularity
);
exit(1);
}
}
// Skip superblock
bs->meta_offset += bs->meta_block_size;
bs->meta_len -= bs->meta_block_size;
bs->dsk.meta_offset += bs->dsk.meta_block_size;
bs->dsk.meta_len -= bs->dsk.meta_block_size;
prev_done = 0;
done_len = 0;
done_pos = 0;
@@ -145,23 +145,23 @@ resume_1:
wait_state = 2;
return 1;
}
if (metadata_read < bs->meta_len)
if (metadata_read < bs->dsk.meta_len)
{
GET_SQE();
data->iov = {
(uint8_t*)metadata_buffer + (bs->inmemory_meta
? metadata_read
: (prev == 1 ? bs->metadata_buf_size : 0)),
bs->meta_len - metadata_read > bs->metadata_buf_size ? bs->metadata_buf_size : bs->meta_len - metadata_read,
bs->dsk.meta_len - metadata_read > bs->metadata_buf_size ? bs->metadata_buf_size : bs->dsk.meta_len - metadata_read,
};
data->callback = [this](ring_data_t *data) { handle_event(data); };
if (!zero_on_init)
my_uring_prep_readv(sqe, bs->meta_fd, &data->iov, 1, bs->meta_offset + metadata_read);
my_uring_prep_readv(sqe, bs->dsk.meta_fd, &data->iov, 1, bs->dsk.meta_offset + metadata_read);
else
{
// Fill metadata with zeroes
memset(data->iov.iov_base, 0, data->iov.iov_len);
my_uring_prep_writev(sqe, bs->meta_fd, &data->iov, 1, bs->meta_offset + metadata_read);
my_uring_prep_writev(sqe, bs->dsk.meta_fd, &data->iov, 1, bs->dsk.meta_offset + metadata_read);
}
bs->ringloop->submit();
submitted = (prev == 1 ? 2 : 1);
@@ -172,11 +172,11 @@ resume_1:
void *done_buf = bs->inmemory_meta
? ((uint8_t*)metadata_buffer + done_pos)
: ((uint8_t*)metadata_buffer + (prev_done == 2 ? bs->metadata_buf_size : 0));
unsigned count = bs->meta_block_size / bs->clean_entry_size;
for (int sector = 0; sector < done_len; sector += bs->meta_block_size)
unsigned count = bs->dsk.meta_block_size / bs->dsk.clean_entry_size;
for (int sector = 0; sector < done_len; sector += bs->dsk.meta_block_size)
{
// handle <count> entries
handle_entries((uint8_t*)done_buf + sector, count, bs->block_order);
handle_entries((uint8_t*)done_buf + sector, count, bs->dsk.block_order);
done_cnt += count;
}
prev_done = 0;
@@ -188,7 +188,7 @@ resume_1:
}
}
// metadata read finished
printf("Metadata entries loaded: %lu, free blocks: %lu / %lu\n", entries_loaded, bs->data_alloc->get_free_count(), bs->block_count);
printf("Metadata entries loaded: %lu, free blocks: %lu / %lu\n", entries_loaded, bs->data_alloc->get_free_count(), bs->dsk.block_count);
if (!bs->inmemory_meta)
{
free(metadata_buffer);
@@ -197,7 +197,7 @@ resume_1:
if (zero_on_init && !bs->disable_meta_fsync)
{
GET_SQE();
my_uring_prep_fsync(sqe, bs->meta_fd, IORING_FSYNC_DATASYNC);
my_uring_prep_fsync(sqe, bs->dsk.meta_fd, IORING_FSYNC_DATASYNC);
data->iov = { 0 };
data->callback = [this](ring_data_t *data) { handle_event(data); };
submitted = 1;
@@ -216,10 +216,10 @@ void blockstore_init_meta::handle_entries(void* entries, unsigned count, int blo
{
for (unsigned i = 0; i < count; i++)
{
clean_disk_entry *entry = (clean_disk_entry*)((uint8_t*)entries + i*bs->clean_entry_size);
if (!bs->inmemory_meta && bs->clean_entry_bitmap_size)
clean_disk_entry *entry = (clean_disk_entry*)((uint8_t*)entries + i*bs->dsk.clean_entry_size);
if (!bs->inmemory_meta && bs->dsk.clean_entry_bitmap_size)
{
memcpy(bs->clean_bitmap + (done_cnt+i)*2*bs->clean_entry_bitmap_size, &entry->bitmap, 2*bs->clean_entry_bitmap_size);
memcpy(bs->clean_bitmap + (done_cnt+i)*2*bs->dsk.clean_entry_bitmap_size, &entry->bitmap, 2*bs->dsk.clean_entry_bitmap_size);
}
if (entry->oid.inode > 0)
{
@@ -240,7 +240,7 @@ void blockstore_init_meta::handle_entries(void* entries, unsigned count, int blo
}
else
{
bs->inode_space_stats[entry->oid.inode] += bs->data_block_size;
bs->inode_space_stats[entry->oid.inode] += bs->dsk.data_block_size;
}
entries_loaded++;
#ifdef BLOCKSTORE_DEBUG
@@ -328,7 +328,7 @@ int blockstore_init_journal::loop()
data = ((ring_data_t*)sqe->user_data);
data->iov = { submitted_buf, bs->journal.block_size };
data->callback = simple_callback;
my_uring_prep_readv(sqe, bs->journal.fd, &data->iov, 1, bs->journal.offset);
my_uring_prep_readv(sqe, bs->dsk.journal_fd, &data->iov, 1, bs->journal.offset);
bs->ringloop->submit();
wait_count = 1;
resume_1:
@@ -367,7 +367,7 @@ resume_1:
GET_SQE();
data->iov = (struct iovec){ submitted_buf, 2*bs->journal.block_size };
data->callback = simple_callback;
my_uring_prep_writev(sqe, bs->journal.fd, &data->iov, 1, bs->journal.offset);
my_uring_prep_writev(sqe, bs->dsk.journal_fd, &data->iov, 1, bs->journal.offset);
wait_count++;
bs->ringloop->submit();
resume_6:
@@ -379,7 +379,7 @@ resume_1:
if (!bs->disable_journal_fsync)
{
GET_SQE();
my_uring_prep_fsync(sqe, bs->journal.fd, IORING_FSYNC_DATASYNC);
my_uring_prep_fsync(sqe, bs->dsk.journal_fd, IORING_FSYNC_DATASYNC);
data->iov = { 0 };
data->callback = simple_callback;
wait_count++;
@@ -448,7 +448,7 @@ resume_1:
end - journal_pos < JOURNAL_BUFFER_SIZE ? end - journal_pos : JOURNAL_BUFFER_SIZE,
};
data->callback = [this](ring_data_t *data1) { handle_event(data1); };
my_uring_prep_readv(sqe, bs->journal.fd, &data->iov, 1, bs->journal.offset + journal_pos);
my_uring_prep_readv(sqe, bs->dsk.journal_fd, &data->iov, 1, bs->journal.offset + journal_pos);
bs->ringloop->submit();
}
while (done.size() > 0)
@@ -463,7 +463,7 @@ resume_1:
GET_SQE();
data->iov = { init_write_buf, bs->journal.block_size };
data->callback = simple_callback;
my_uring_prep_writev(sqe, bs->journal.fd, &data->iov, 1, bs->journal.offset + init_write_sector);
my_uring_prep_writev(sqe, bs->dsk.journal_fd, &data->iov, 1, bs->journal.offset + init_write_sector);
wait_count++;
bs->ringloop->submit();
resume_7:
@@ -477,7 +477,7 @@ resume_1:
GET_SQE();
data->iov = { 0 };
data->callback = simple_callback;
my_uring_prep_fsync(sqe, bs->journal.fd, IORING_FSYNC_DATASYNC);
my_uring_prep_fsync(sqe, bs->dsk.journal_fd, IORING_FSYNC_DATASYNC);
wait_count++;
bs->ringloop->submit();
}
@@ -544,7 +544,7 @@ resume_1:
? bs->journal.len-bs->journal.block_size - (bs->journal.next_free-bs->journal.used_start)
: bs->journal.used_start - bs->journal.next_free),
bs->journal.used_start, bs->journal.next_free,
bs->data_alloc->get_free_count(), bs->block_count
bs->data_alloc->get_free_count(), bs->dsk.block_count
);
bs->journal.crc32_last = crc32_last;
return 0;
@@ -669,9 +669,9 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
};
void *bmp = NULL;
void *bmp_from = (uint8_t*)je + sizeof(journal_entry_small_write);
if (bs->clean_entry_bitmap_size <= sizeof(void*))
if (bs->dsk.clean_entry_bitmap_size <= sizeof(void*))
{
memcpy(&bmp, bmp_from, bs->clean_entry_bitmap_size);
memcpy(&bmp, bmp_from, bs->dsk.clean_entry_bitmap_size);
}
else
{
@@ -679,8 +679,8 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
// allocations for entry bitmaps. This can only be fixed by using
// a patched map with dynamic entry size, but not the btree_map,
// because it doesn't keep iterators valid all the time.
bmp = malloc_or_die(bs->clean_entry_bitmap_size);
memcpy(bmp, bmp_from, bs->clean_entry_bitmap_size);
bmp = malloc_or_die(bs->dsk.clean_entry_bitmap_size);
memcpy(bmp, bmp_from, bs->dsk.clean_entry_bitmap_size);
}
bs->dirty_db.emplace(ov, (dirty_entry){
.state = (BS_ST_SMALL_WRITE | BS_ST_SYNCED),
@@ -712,7 +712,7 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
printf(
"je_big_write%s oid=%lx:%lx ver=%lu loc=%lu\n",
je->type == JE_BIG_WRITE_INSTANT ? "_instant" : "",
je->big_write.oid.inode, je->big_write.oid.stripe, je->big_write.version, je->big_write.location >> bs->block_order
je->big_write.oid.inode, je->big_write.oid.stripe, je->big_write.version, je->big_write.location >> bs->dsk.block_order
);
#endif
auto dirty_it = bs->dirty_db.upper_bound((obj_ver_id){
@@ -750,9 +750,9 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
};
void *bmp = NULL;
void *bmp_from = (uint8_t*)je + sizeof(journal_entry_big_write);
if (bs->clean_entry_bitmap_size <= sizeof(void*))
if (bs->dsk.clean_entry_bitmap_size <= sizeof(void*))
{
memcpy(&bmp, bmp_from, bs->clean_entry_bitmap_size);
memcpy(&bmp, bmp_from, bs->dsk.clean_entry_bitmap_size);
}
else
{
@@ -760,8 +760,8 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
// allocations for entry bitmaps. This can only be fixed by using
// a patched map with dynamic entry size, but not the btree_map,
// because it doesn't keep iterators valid all the time.
bmp = malloc_or_die(bs->clean_entry_bitmap_size);
memcpy(bmp, bmp_from, bs->clean_entry_bitmap_size);
bmp = malloc_or_die(bs->dsk.clean_entry_bitmap_size);
memcpy(bmp, bmp_from, bs->dsk.clean_entry_bitmap_size);
}
auto dirty_it = bs->dirty_db.emplace(ov, (dirty_entry){
.state = (BS_ST_BIG_WRITE | BS_ST_SYNCED),
@@ -772,7 +772,7 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
.journal_sector = proc_pos,
.bitmap = bmp,
}).first;
if (bs->data_alloc->get(je->big_write.location >> bs->block_order))
if (bs->data_alloc->get(je->big_write.location >> bs->dsk.block_order))
{
// This is probably a big_write that's already flushed and freed, but it may
// also indicate a bug. So we remember such entries and recheck them afterwards.
@@ -785,11 +785,11 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
#ifdef BLOCKSTORE_DEBUG
printf(
"Allocate block (journal) %lu: %lx:%lx v%lu\n",
je->big_write.location >> bs->block_order,
je->big_write.location >> bs->dsk.block_order,
ov.oid.inode, ov.oid.stripe, ov.version
);
#endif
bs->data_alloc->set(je->big_write.location >> bs->block_order, true);
bs->data_alloc->set(je->big_write.location >> bs->dsk.block_order, true);
}
bs->journal.used_sectors[proc_pos]++;
#ifdef BLOCKSTORE_DEBUG
@@ -913,8 +913,8 @@ void blockstore_init_journal::erase_dirty_object(blockstore_dirty_db_t::iterator
if (exists && clean_loc == UINT64_MAX)
{
auto & sp = bs->inode_space_stats[oid.inode];
if (sp > bs->data_block_size)
sp -= bs->data_block_size;
if (sp > bs->dsk.data_block_size)
sp -= bs->dsk.data_block_size;
else
bs->inode_space_stats.erase(oid.inode);
}

src/blockstore_journal.cpp

@@ -175,7 +175,7 @@ void blockstore_impl_t::prepare_journal_sector_write(int cur_sector, blockstore_
};
data->callback = [this, flush_id = journal.submit_id](ring_data_t *data) { handle_journal_write(data, flush_id); };
my_uring_prep_writev(
sqe, journal.fd, &data->iov, 1, journal.offset + journal.sector_info[cur_sector].offset
sqe, dsk.journal_fd, &data->iov, 1, journal.offset + journal.sector_info[cur_sector].offset
);
}
journal.sector_info[cur_sector].dirty = false;

src/blockstore_journal.h

@@ -164,7 +164,7 @@ inline bool operator < (const pending_journaling_t & a, const pending_journaling
struct journal_t
{
int fd;
uint64_t device_size;
bool inmemory = false;
bool flush_journal = false;
void *buffer = NULL;

src/blockstore_open.cpp

@@ -4,23 +4,10 @@
#include <sys/file.h>
#include "blockstore_impl.h"
static uint32_t is_power_of_two(uint64_t value)
{
uint32_t l = 0;
while (value > 1)
{
if (value & 1)
{
return 64;
}
value = value >> 1;
l++;
}
return l;
}
void blockstore_impl_t::parse_config(blockstore_config_t & config)
{
// Common disk options
dsk.parse_config(config);
// Parse
if (config["readonly"] == "true" || config["readonly"] == "1" || config["readonly"] == "yes")
{
@@ -38,10 +25,6 @@ void blockstore_impl_t::parse_config(blockstore_config_t & config)
{
disable_journal_fsync = true;
}
if (config["disable_device_lock"] == "true" || config["disable_device_lock"] == "1" || config["disable_device_lock"] == "yes")
{
disable_flock = true;
}
if (config["flush_journal"] == "true" || config["flush_journal"] == "1" || config["flush_journal"] == "yes")
{
// Only flush journal and exit
@@ -56,24 +39,11 @@ void blockstore_impl_t::parse_config(blockstore_config_t & config)
immediate_commit = IMMEDIATE_SMALL;
}
metadata_buf_size = strtoull(config["meta_buf_size"].c_str(), NULL, 10);
cfg_journal_size = strtoull(config["journal_size"].c_str(), NULL, 10);
data_device = config["data_device"];
data_offset = strtoull(config["data_offset"].c_str(), NULL, 10);
cfg_data_size = strtoull(config["data_size"].c_str(), NULL, 10);
meta_device = config["meta_device"];
meta_offset = strtoull(config["meta_offset"].c_str(), NULL, 10);
data_block_size = strtoull(config["block_size"].c_str(), NULL, 10);
inmemory_meta = config["inmemory_metadata"] != "false";
journal_device = config["journal_device"];
journal.offset = strtoull(config["journal_offset"].c_str(), NULL, 10);
journal.sector_count = strtoull(config["journal_sector_buffer_count"].c_str(), NULL, 10);
journal.no_same_sector_overwrites = config["journal_no_same_sector_overwrites"] == "true" ||
config["journal_no_same_sector_overwrites"] == "1" || config["journal_no_same_sector_overwrites"] == "yes";
journal.inmemory = config["inmemory_journal"] != "false";
disk_alignment = strtoull(config["disk_alignment"].c_str(), NULL, 10);
journal_block_size = strtoull(config["journal_block_size"].c_str(), NULL, 10);
meta_block_size = strtoull(config["meta_block_size"].c_str(), NULL, 10);
bitmap_granularity = strtoull(config["bitmap_granularity"].c_str(), NULL, 10);
max_flusher_count = strtoull(config["max_flusher_count"].c_str(), NULL, 10);
if (!max_flusher_count)
max_flusher_count = strtoull(config["flusher_count"].c_str(), NULL, 10);
@@ -85,14 +55,6 @@ void blockstore_impl_t::parse_config(blockstore_config_t & config)
throttle_target_parallelism = strtoull(config["throttle_target_parallelism"].c_str(), NULL, 10);
throttle_threshold_us = strtoull(config["throttle_threshold_us"].c_str(), NULL, 10);
// Validate
if (!data_block_size)
{
data_block_size = (1 << DEFAULT_DATA_BLOCK_ORDER);
}
if ((block_order = is_power_of_two(data_block_size)) >= 64 || data_block_size < MIN_DATA_BLOCK_SIZE || data_block_size >= MAX_DATA_BLOCK_SIZE)
{
throw std::runtime_error("Bad block size");
}
if (!max_flusher_count)
{
max_flusher_count = 256;
@@ -105,62 +67,6 @@ void blockstore_impl_t::parse_config(blockstore_config_t & config)
{
max_write_iodepth = 128;
}
if (!disk_alignment)
{
disk_alignment = 4096;
}
else if (disk_alignment % DIRECT_IO_ALIGNMENT)
{
throw std::runtime_error("disk_alignment must be a multiple of "+std::to_string(DIRECT_IO_ALIGNMENT));
}
if (!journal_block_size)
{
journal_block_size = 4096;
}
else if (journal_block_size % DIRECT_IO_ALIGNMENT)
{
throw std::runtime_error("journal_block_size must be a multiple of "+std::to_string(DIRECT_IO_ALIGNMENT));
}
if (!meta_block_size)
{
meta_block_size = 4096;
}
else if (meta_block_size % DIRECT_IO_ALIGNMENT)
{
throw std::runtime_error("meta_block_size must be a multiple of "+std::to_string(DIRECT_IO_ALIGNMENT));
}
if (data_offset % disk_alignment)
{
throw std::runtime_error("data_offset must be a multiple of disk_alignment = "+std::to_string(disk_alignment));
}
if (!bitmap_granularity)
{
bitmap_granularity = DEFAULT_BITMAP_GRANULARITY;
}
else if (bitmap_granularity % disk_alignment)
{
throw std::runtime_error("Sparse write tracking granularity must be a multiple of disk_alignment = "+std::to_string(disk_alignment));
}
if (data_block_size % bitmap_granularity)
{
throw std::runtime_error("Block size must be a multiple of sparse write tracking granularity");
}
if (journal_device == meta_device || meta_device == "" && journal_device == data_device)
{
journal_device = "";
}
if (meta_device == data_device)
{
meta_device = "";
}
if (meta_offset % meta_block_size)
{
throw std::runtime_error("meta_offset must be a multiple of meta_block_size = "+std::to_string(meta_block_size));
}
if (journal.offset % journal_block_size)
{
throw std::runtime_error("journal_offset must be a multiple of journal_block_size = "+std::to_string(journal_block_size));
}
if (journal.sector_count < 2)
{
journal.sector_count = 32;
@@ -169,11 +75,11 @@ void blockstore_impl_t::parse_config(blockstore_config_t & config)
{
metadata_buf_size = 4*1024*1024;
}
if (meta_device == "")
if (dsk.meta_device == "")
{
disable_meta_fsync = disable_data_fsync;
}
if (journal_device == "")
if (dsk.journal_device == "")
{
disable_journal_fsync = disable_meta_fsync;
}
@@ -202,238 +108,46 @@ void blockstore_impl_t::parse_config(blockstore_config_t & config)
throttle_threshold_us = 50;
}
// init some fields
clean_entry_bitmap_size = data_block_size / bitmap_granularity / 8;
clean_entry_size = sizeof(clean_disk_entry) + 2*clean_entry_bitmap_size;
journal.block_size = journal_block_size;
journal.next_free = journal_block_size;
journal.used_start = journal_block_size;
journal.block_size = dsk.journal_block_size;
journal.next_free = dsk.journal_block_size;
journal.used_start = dsk.journal_block_size;
// no free space because sector is initially unmapped
journal.in_sector_pos = journal_block_size;
journal.in_sector_pos = dsk.journal_block_size;
}
void blockstore_impl_t::calc_lengths()
{
// data
data_len = data_device_size - data_offset;
if (data_fd == meta_fd && data_offset < meta_offset)
{
data_len = meta_offset - data_offset;
}
if (data_fd == journal.fd && data_offset < journal.offset)
{
data_len = data_len < journal.offset-data_offset
? data_len : journal.offset-data_offset;
}
if (cfg_data_size != 0)
{
if (data_len < cfg_data_size)
{
throw std::runtime_error("Data area ("+std::to_string(data_len)+
" bytes) is less than configured size ("+std::to_string(cfg_data_size)+" bytes)");
}
data_len = cfg_data_size;
}
// meta
uint64_t meta_area_size = (meta_fd == data_fd ? data_device_size : meta_device_size) - meta_offset;
if (meta_fd == data_fd && meta_offset <= data_offset)
{
meta_area_size = data_offset - meta_offset;
}
if (meta_fd == journal.fd && meta_offset <= journal.offset)
{
meta_area_size = meta_area_size < journal.offset-meta_offset
? meta_area_size : journal.offset-meta_offset;
}
// journal
journal.len = (journal.fd == data_fd ? data_device_size : (journal.fd == meta_fd ? meta_device_size : journal.device_size)) - journal.offset;
if (journal.fd == data_fd && journal.offset <= data_offset)
{
journal.len = data_offset - journal.offset;
}
if (journal.fd == meta_fd && journal.offset <= meta_offset)
{
journal.len = journal.len < meta_offset-journal.offset
? journal.len : meta_offset-journal.offset;
}
// required metadata size
block_count = data_len / data_block_size;
meta_len = (1 + (block_count - 1 + meta_block_size / clean_entry_size) / (meta_block_size / clean_entry_size)) * meta_block_size;
if (meta_area_size < meta_len)
{
throw std::runtime_error("Metadata area is too small, need at least "+std::to_string(meta_len)+" bytes");
}
dsk.calc_lengths();
journal.len = dsk.journal_len;
journal.block_size = dsk.journal_block_size;
journal.offset = dsk.journal_offset;
if (inmemory_meta)
{
metadata_buffer = memalign(MEM_ALIGNMENT, meta_len);
metadata_buffer = memalign(MEM_ALIGNMENT, dsk.meta_len);
if (!metadata_buffer)
throw std::runtime_error("Failed to allocate memory for the metadata");
}
else if (clean_entry_bitmap_size)
else if (dsk.clean_entry_bitmap_size)
{
clean_bitmap = (uint8_t*)malloc(block_count * 2*clean_entry_bitmap_size);
clean_bitmap = (uint8_t*)malloc(dsk.block_count * 2*dsk.clean_entry_bitmap_size);
if (!clean_bitmap)
throw std::runtime_error("Failed to allocate memory for the metadata sparse write bitmap");
}
// requested journal size
if (cfg_journal_size > journal.len)
{
throw std::runtime_error("Requested journal_size is too large");
}
else if (cfg_journal_size > 0)
{
journal.len = cfg_journal_size;
}
if (journal.len < MIN_JOURNAL_SIZE)
{
throw std::runtime_error("Journal is too small, need at least "+std::to_string(MIN_JOURNAL_SIZE)+" bytes");
}
if (journal.inmemory)
{
journal.buffer = memalign(MEM_ALIGNMENT, journal.len);
if (!journal.buffer)
throw std::runtime_error("Failed to allocate memory for journal");
}
}
static void check_size(int fd, uint64_t *size, uint64_t *sectsize, std::string name)
{
int sect;
struct stat st;
if (fstat(fd, &st) < 0)
{
throw std::runtime_error("Failed to stat "+name);
}
if (S_ISREG(st.st_mode))
{
*size = st.st_size;
if (sectsize)
{
*sectsize = st.st_blksize;
}
}
else if (S_ISBLK(st.st_mode))
{
if (ioctl(fd, BLKGETSIZE64, size) < 0 ||
ioctl(fd, BLKSSZGET, &sect) < 0)
{
throw std::runtime_error("Failed to get "+name+" size or block size: "+strerror(errno));
}
if (sectsize)
{
*sectsize = sect;
}
}
else
{
throw std::runtime_error(name+" is neither a file nor a block device");
}
}
void blockstore_impl_t::open_data()
{
data_fd = open(data_device.c_str(), O_DIRECT|O_RDWR);
if (data_fd == -1)
{
throw std::runtime_error("Failed to open data device");
}
check_size(data_fd, &data_device_size, &data_device_sect, "data device");
if (disk_alignment % data_device_sect)
{
throw std::runtime_error(
"disk_alignment ("+std::to_string(disk_alignment)+
") is not a multiple of data device sector size ("+std::to_string(data_device_sect)+")"
);
}
if (data_offset >= data_device_size)
{
throw std::runtime_error("data_offset exceeds device size = "+std::to_string(data_device_size));
}
if (!disable_flock && flock(data_fd, LOCK_EX|LOCK_NB) != 0)
{
throw std::runtime_error(std::string("Failed to lock data device: ") + strerror(errno));
}
}
void blockstore_impl_t::open_meta()
{
if (meta_device != "")
{
meta_offset = 0;
meta_fd = open(meta_device.c_str(), O_DIRECT|O_RDWR);
if (meta_fd == -1)
{
throw std::runtime_error("Failed to open metadata device");
}
check_size(meta_fd, &meta_device_size, &meta_device_sect, "metadata device");
if (meta_offset >= meta_device_size)
{
throw std::runtime_error("meta_offset exceeds device size = "+std::to_string(meta_device_size));
}
if (!disable_flock && flock(meta_fd, LOCK_EX|LOCK_NB) != 0)
{
throw std::runtime_error(std::string("Failed to lock metadata device: ") + strerror(errno));
}
}
else
{
meta_fd = data_fd;
meta_device_sect = data_device_sect;
meta_device_size = 0;
if (meta_offset >= data_device_size)
{
throw std::runtime_error("meta_offset exceeds device size = "+std::to_string(data_device_size));
}
}
if (meta_block_size % meta_device_sect)
{
throw std::runtime_error(
"meta_block_size ("+std::to_string(meta_block_size)+
") is not a multiple of data device sector size ("+std::to_string(meta_device_sect)+")"
);
}
}
void blockstore_impl_t::open_journal()
{
if (journal_device != "")
{
journal.fd = open(journal_device.c_str(), O_DIRECT|O_RDWR);
if (journal.fd == -1)
{
throw std::runtime_error("Failed to open journal device");
}
check_size(journal.fd, &journal.device_size, &journal_device_sect, "journal device");
if (!disable_flock && flock(journal.fd, LOCK_EX|LOCK_NB) != 0)
{
throw std::runtime_error(std::string("Failed to lock journal device: ") + strerror(errno));
}
}
else
{
journal.fd = meta_fd;
journal_device_sect = meta_device_sect;
journal.device_size = 0;
if (journal.offset >= data_device_size)
{
throw std::runtime_error("journal_offset exceeds device size");
}
journal.sector_buf = (uint8_t*)memalign(MEM_ALIGNMENT, journal.sector_count * dsk.journal_block_size);
if (!journal.sector_buf)
throw std::bad_alloc();
}
journal.sector_info = (journal_sector_info_t*)calloc(journal.sector_count, sizeof(journal_sector_info_t));
if (!journal.sector_info)
{
throw std::bad_alloc();
}
if (!journal.inmemory)
{
journal.sector_buf = (uint8_t*)memalign(MEM_ALIGNMENT, journal.sector_count * journal_block_size);
if (!journal.sector_buf)
throw std::bad_alloc();
}
if (journal_block_size % journal_device_sect)
{
throw std::runtime_error(
"journal_block_size ("+std::to_string(journal_block_size)+
") is not a multiple of journal device sector size ("+std::to_string(journal_device_sect)+")"
);
}
}

src/blockstore_read.cpp

@@ -32,9 +32,9 @@ int blockstore_impl_t::fulfill_read_push(blockstore_op_t *op, void *buf, uint64_
PRIV(op)->pending_ops++;
my_uring_prep_readv(
sqe,
IS_JOURNAL(item_state) ? journal.fd : data_fd,
IS_JOURNAL(item_state) ? dsk.journal_fd : dsk.data_fd,
&data->iov, 1,
(IS_JOURNAL(item_state) ? journal.offset : data_offset) + offset
(IS_JOURNAL(item_state) ? dsk.journal_offset : dsk.data_offset) + offset
);
data->callback = [this, op](ring_data_t *data) { handle_read_event(data, op); };
return 1;
@@ -97,15 +97,15 @@ endwhile:
uint8_t* blockstore_impl_t::get_clean_entry_bitmap(uint64_t block_loc, int offset)
{
uint8_t *clean_entry_bitmap;
uint64_t meta_loc = block_loc >> block_order;
uint64_t meta_loc = block_loc >> dsk.block_order;
if (inmemory_meta)
{
uint64_t sector = (meta_loc / (meta_block_size / clean_entry_size)) * meta_block_size;
uint64_t pos = (meta_loc % (meta_block_size / clean_entry_size));
clean_entry_bitmap = ((uint8_t*)metadata_buffer + sector + pos*clean_entry_size + sizeof(clean_disk_entry) + offset);
uint64_t sector = (meta_loc / (dsk.meta_block_size / dsk.clean_entry_size)) * dsk.meta_block_size;
uint64_t pos = (meta_loc % (dsk.meta_block_size / dsk.clean_entry_size));
clean_entry_bitmap = ((uint8_t*)metadata_buffer + sector + pos*dsk.clean_entry_size + sizeof(clean_disk_entry) + offset);
}
else
clean_entry_bitmap = (uint8_t*)(clean_bitmap + meta_loc*2*clean_entry_bitmap_size + offset);
clean_entry_bitmap = (uint8_t*)(clean_bitmap + meta_loc*2*dsk.clean_entry_bitmap_size + offset);
return clean_entry_bitmap;
}
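
get_clean_entry_bitmap mirrors the metadata layout that calc_lengths sizes: an array of meta blocks, each packed with meta_block_size / clean_entry_size entries, every entry being a clean_disk_entry header followed by two bitmaps. An editorial restatement of the addressing:
// meta_loc = block_loc >> dsk.block_order            // data block index
// sector   = (meta_loc / entries_per_block) * dsk.meta_block_size
// pos      = meta_loc % entries_per_block            // slot inside that block
// bitmap   = entry + sizeof(clean_disk_entry) + offset,
//            offset = 0 (internal bitmap) or clean_entry_bitmap_size (external)
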
@@ -152,8 +152,8 @@ int blockstore_impl_t::dequeue_read(blockstore_op_t *read_op)
result_version = dirty_it->first.version;
if (read_op->bitmap)
{
void *bmp_ptr = (clean_entry_bitmap_size > sizeof(void*) ? dirty_it->second.bitmap : &dirty_it->second.bitmap);
memcpy(read_op->bitmap, bmp_ptr, clean_entry_bitmap_size);
void *bmp_ptr = (dsk.clean_entry_bitmap_size > sizeof(void*) ? dirty_it->second.bitmap : &dirty_it->second.bitmap);
memcpy(read_op->bitmap, bmp_ptr, dsk.clean_entry_bitmap_size);
}
}
if (!fulfill_read(read_op, fulfilled, dirty.offset, dirty.offset + dirty.len,
@@ -178,15 +178,15 @@ int blockstore_impl_t::dequeue_read(blockstore_op_t *read_op)
result_version = clean_it->second.version;
if (read_op->bitmap)
{
void *bmp_ptr = get_clean_entry_bitmap(clean_it->second.location, clean_entry_bitmap_size);
memcpy(read_op->bitmap, bmp_ptr, clean_entry_bitmap_size);
void *bmp_ptr = get_clean_entry_bitmap(clean_it->second.location, dsk.clean_entry_bitmap_size);
memcpy(read_op->bitmap, bmp_ptr, dsk.clean_entry_bitmap_size);
}
}
if (fulfilled < read_op->len)
{
if (!clean_entry_bitmap_size)
if (!dsk.clean_entry_bitmap_size)
{
if (!fulfill_read(read_op, fulfilled, 0, data_block_size, (BS_ST_BIG_WRITE | BS_ST_STABLE), 0, clean_it->second.location))
if (!fulfill_read(read_op, fulfilled, 0, dsk.data_block_size, (BS_ST_BIG_WRITE | BS_ST_STABLE), 0, clean_it->second.location))
{
// need to wait. undo added requests, don't dequeue op
PRIV(read_op)->read_vec.clear();
@@ -196,7 +196,7 @@ int blockstore_impl_t::dequeue_read(blockstore_op_t *read_op)
else
{
uint8_t *clean_entry_bitmap = get_clean_entry_bitmap(clean_it->second.location, 0);
uint64_t bmp_start = 0, bmp_end = 0, bmp_size = data_block_size/bitmap_granularity;
uint64_t bmp_start = 0, bmp_end = 0, bmp_size = dsk.data_block_size/dsk.bitmap_granularity;
while (bmp_start < bmp_size)
{
while (!(clean_entry_bitmap[bmp_end >> 3] & (1 << (bmp_end & 0x7))) && bmp_end < bmp_size)
@@ -206,8 +206,8 @@ int blockstore_impl_t::dequeue_read(blockstore_op_t *read_op)
if (bmp_end > bmp_start)
{
// fill with zeroes
assert(fulfill_read(read_op, fulfilled, bmp_start * bitmap_granularity,
bmp_end * bitmap_granularity, (BS_ST_DELETE | BS_ST_STABLE), 0, 0));
assert(fulfill_read(read_op, fulfilled, bmp_start * dsk.bitmap_granularity,
bmp_end * dsk.bitmap_granularity, (BS_ST_DELETE | BS_ST_STABLE), 0, 0));
}
bmp_start = bmp_end;
while (clean_entry_bitmap[bmp_end >> 3] & (1 << (bmp_end & 0x7)) && bmp_end < bmp_size)
@@ -216,9 +216,9 @@ int blockstore_impl_t::dequeue_read(blockstore_op_t *read_op)
}
if (bmp_end > bmp_start)
{
if (!fulfill_read(read_op, fulfilled, bmp_start * bitmap_granularity,
bmp_end * bitmap_granularity, (BS_ST_BIG_WRITE | BS_ST_STABLE), 0,
clean_it->second.location + bmp_start * bitmap_granularity))
if (!fulfill_read(read_op, fulfilled, bmp_start * dsk.bitmap_granularity,
bmp_end * dsk.bitmap_granularity, (BS_ST_BIG_WRITE | BS_ST_STABLE), 0,
clean_it->second.location + bmp_start * dsk.bitmap_granularity))
{
// need to wait. undo added requests, don't dequeue op
PRIV(read_op)->read_vec.clear();
@@ -233,7 +233,7 @@ int blockstore_impl_t::dequeue_read(blockstore_op_t *read_op)
else if (fulfilled < read_op->len)
{
// fill remaining parts with zeroes
assert(fulfill_read(read_op, fulfilled, 0, data_block_size, (BS_ST_DELETE | BS_ST_STABLE), 0, 0));
assert(fulfill_read(read_op, fulfilled, 0, dsk.data_block_size, (BS_ST_DELETE | BS_ST_STABLE), 0, 0));
}
assert(fulfilled == read_op->len);
read_op->version = result_version;
@@ -288,8 +288,8 @@ int blockstore_impl_t::read_bitmap(object_id oid, uint64_t target_version, void
*result_version = dirty_it->first.version;
if (bitmap)
{
void *bmp_ptr = (clean_entry_bitmap_size > sizeof(void*) ? dirty_it->second.bitmap : &dirty_it->second.bitmap);
memcpy(bitmap, bmp_ptr, clean_entry_bitmap_size);
void *bmp_ptr = (dsk.clean_entry_bitmap_size > sizeof(void*) ? dirty_it->second.bitmap : &dirty_it->second.bitmap);
memcpy(bitmap, bmp_ptr, dsk.clean_entry_bitmap_size);
}
return 0;
}
@@ -306,14 +306,14 @@ int blockstore_impl_t::read_bitmap(object_id oid, uint64_t target_version, void
*result_version = clean_it->second.version;
if (bitmap)
{
-void *bmp_ptr = get_clean_entry_bitmap(clean_it->second.location, clean_entry_bitmap_size);
-memcpy(bitmap, bmp_ptr, clean_entry_bitmap_size);
+void *bmp_ptr = get_clean_entry_bitmap(clean_it->second.location, dsk.clean_entry_bitmap_size);
+memcpy(bitmap, bmp_ptr, dsk.clean_entry_bitmap_size);
}
return 0;
}
if (result_version)
*result_version = 0;
if (bitmap)
-memset(bitmap, 0, clean_entry_bitmap_size);
+memset(bitmap, 0, dsk.clean_entry_bitmap_size);
return -ENOENT;
}
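
A note on the `clean_entry_bitmap_size > sizeof(void*)` pattern that recurs throughout these hunks: the bitmap field of a dirty entry is a small-buffer optimization. When the object bitmap fits into a pointer-sized field, the bits are stored inline in the field itself; otherwise the field holds a heap pointer. A minimal sketch under that assumption (hypothetical entry type, not the real dirty_entry):

#include <cstddef>
#include <cstdint>
#include <cstdlib>
#include <cstring>

// Hypothetical entry illustrating the inline-bitmap trick.
struct entry_with_bitmap
{
    void *bitmap; // inline bit storage or heap pointer, depending on size

    void set_bitmap(const void *src, size_t bmp_size)
    {
        if (bmp_size > sizeof(void*))
        {
            bitmap = malloc(bmp_size);      // large bitmap: heap-allocated
            memcpy(bitmap, src, bmp_size);
        }
        else
            memcpy(&bitmap, src, bmp_size); // small bitmap: lives in the field itself
    }

    const void *get_bitmap(size_t bmp_size) const
    {
        return bmp_size > sizeof(void*) ? bitmap : (const void*)&bitmap;
    }
};
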

View File

@@ -112,7 +112,7 @@ resume_2:
if (!disable_journal_fsync)
{
BS_SUBMIT_GET_SQE(sqe, data);
-my_uring_prep_fsync(sqe, journal.fd, IORING_FSYNC_DATASYNC);
+my_uring_prep_fsync(sqe, dsk.journal_fd, IORING_FSYNC_DATASYNC);
data->iov = { 0 };
data->callback = [this, op](ring_data_t *data) { handle_write_event(data, op); };
PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 0;
@@ -217,10 +217,10 @@ void blockstore_impl_t::erase_dirty(blockstore_dirty_db_t::iterator dirty_start,
dirty_it->second.location != UINT64_MAX)
{
#ifdef BLOCKSTORE_DEBUG
-printf("Free block %lu from %lx:%lx v%lu\n", dirty_it->second.location >> block_order,
+printf("Free block %lu from %lx:%lx v%lu\n", dirty_it->second.location >> dsk.block_order,
dirty_it->first.oid.inode, dirty_it->first.oid.stripe, dirty_it->first.version);
#endif
-data_alloc->set(dirty_it->second.location >> block_order, false);
+data_alloc->set(dirty_it->second.location >> dsk.block_order, false);
}
int used = --journal.used_sectors[dirty_it->second.journal_sector];
#ifdef BLOCKSTORE_DEBUG
@@ -233,7 +233,7 @@ void blockstore_impl_t::erase_dirty(blockstore_dirty_db_t::iterator dirty_start,
{
journal.used_sectors.erase(dirty_it->second.journal_sector);
}
-if (clean_entry_bitmap_size > sizeof(void*))
+if (dsk.clean_entry_bitmap_size > sizeof(void*))
{
free(dirty_it->second.bitmap);
dirty_it->second.bitmap = NULL;
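
For reference, the `location >> block_order` shifts above: the data block size is constrained to a power of two (see is_power_of_two() in blockstore_disk.cpp), so a byte offset converts to an allocator block index with a single right shift. A small worked example with assumed values, not taken from a real config:

#include <cstdint>

// 128 KiB blocks => block order 17.
uint64_t example_block_number()
{
    uint64_t data_block_size = 131072;       // 2^17
    uint32_t block_order = 17;               // log2(data_block_size)
    uint64_t location = 5 * data_block_size; // byte offset of data block #5
    return location >> block_order;          // == 5, the allocator index
    // data_alloc->set(5, false) would then mark exactly that block free,
    // mirroring the erase_dirty() hunk above.
}
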

View File

@@ -137,7 +137,7 @@ resume_2:
if (!disable_journal_fsync)
{
BS_SUBMIT_GET_SQE(sqe, data);
-my_uring_prep_fsync(sqe, journal.fd, IORING_FSYNC_DATASYNC);
+my_uring_prep_fsync(sqe, dsk.journal_fd, IORING_FSYNC_DATASYNC);
data->iov = { 0 };
data->callback = [this, op](ring_data_t *data) { handle_write_event(data, op); };
PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 0;
@@ -195,14 +195,14 @@ void blockstore_impl_t::mark_stable(const obj_ver_id & v, bool forget_dirty)
}
if (!exists)
{
-inode_space_stats[dirty_it->first.oid.inode] += data_block_size;
+inode_space_stats[dirty_it->first.oid.inode] += dsk.data_block_size;
}
}
else if (IS_DELETE(dirty_it->second.state))
{
auto & sp = inode_space_stats[dirty_it->first.oid.inode];
-if (sp > data_block_size)
-sp -= data_block_size;
+if (sp > dsk.data_block_size)
+sp -= dsk.data_block_size;
else
inode_space_stats.erase(dirty_it->first.oid.inode);
}
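
The delete branch above is a saturating decrement: inode_space_stats holds unsigned byte counters, so subtracting a block from a counter that does not exceed one block would wrap around, and the entry is erased instead. A minimal sketch of the same accounting, assuming a std::map of per-inode sizes:

#include <cstdint>
#include <map>

// Per-inode space accounting on delete: a counter that no longer exceeds
// one block is dropped entirely to avoid unsigned underflow.
void on_delete(std::map<uint64_t, uint64_t> & space_stats, uint64_t inode, uint64_t block_size)
{
    auto & sp = space_stats[inode];
    if (sp > block_size)
        sp -= block_size;
    else
        space_stats.erase(inode);
}
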

View File

@@ -60,7 +60,7 @@ int blockstore_impl_t::continue_sync(blockstore_op_t *op, bool queue_has_in_prog
if (!disable_data_fsync)
{
BS_SUBMIT_GET_SQE(sqe, data);
-my_uring_prep_fsync(sqe, data_fd, IORING_FSYNC_DATASYNC);
+my_uring_prep_fsync(sqe, dsk.data_fd, IORING_FSYNC_DATASYNC);
data->iov = { 0 };
data->callback = [this, op](ring_data_t *data) { handle_write_event(data, op); };
PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 0;
@@ -79,7 +79,7 @@ int blockstore_impl_t::continue_sync(blockstore_op_t *op, bool queue_has_in_prog
// Check space in the journal and journal memory buffers
blockstore_journal_check_t space_check(this);
if (!space_check.check_available(op, PRIV(op)->sync_big_writes.size(),
-sizeof(journal_entry_big_write) + clean_entry_bitmap_size, JOURNAL_STABILIZE_RESERVATION))
+sizeof(journal_entry_big_write) + dsk.clean_entry_bitmap_size, JOURNAL_STABILIZE_RESERVATION))
{
return 0;
}
@@ -90,7 +90,7 @@ int blockstore_impl_t::continue_sync(blockstore_op_t *op, bool queue_has_in_prog
int s = 0;
while (it != PRIV(op)->sync_big_writes.end())
{
-if (!journal.entry_fits(sizeof(journal_entry_big_write) + clean_entry_bitmap_size) &&
+if (!journal.entry_fits(sizeof(journal_entry_big_write) + dsk.clean_entry_bitmap_size) &&
journal.sector_info[journal.cur_sector].dirty)
{
prepare_journal_sector_write(journal.cur_sector, op);
@@ -99,7 +99,7 @@ int blockstore_impl_t::continue_sync(blockstore_op_t *op, bool queue_has_in_prog
auto & dirty_entry = dirty_db.at(*it);
journal_entry_big_write *je = (journal_entry_big_write*)prefill_single_journal_entry(
journal, (dirty_entry.state & BS_ST_INSTANT) ? JE_BIG_WRITE_INSTANT : JE_BIG_WRITE,
-sizeof(journal_entry_big_write) + clean_entry_bitmap_size
+sizeof(journal_entry_big_write) + dsk.clean_entry_bitmap_size
);
dirty_entry.journal_sector = journal.sector_info[journal.cur_sector].offset;
journal.used_sectors[journal.sector_info[journal.cur_sector].offset]++;
@@ -115,8 +115,8 @@ int blockstore_impl_t::continue_sync(blockstore_op_t *op, bool queue_has_in_prog
je->offset = dirty_entry.offset;
je->len = dirty_entry.len;
je->location = dirty_entry.location;
-memcpy((void*)(je+1), (clean_entry_bitmap_size > sizeof(void*)
-? dirty_entry.bitmap : &dirty_entry.bitmap), clean_entry_bitmap_size);
+memcpy((void*)(je+1), (dsk.clean_entry_bitmap_size > sizeof(void*)
+? dirty_entry.bitmap : &dirty_entry.bitmap), dsk.clean_entry_bitmap_size);
je->crc32 = je_crc32((journal_entry*)je);
journal.crc32_last = je->crc32;
it++;
@@ -132,7 +132,7 @@ int blockstore_impl_t::continue_sync(blockstore_op_t *op, bool queue_has_in_prog
if (!disable_journal_fsync)
{
BS_SUBMIT_GET_SQE(sqe, data);
-my_uring_prep_fsync(sqe, journal.fd, IORING_FSYNC_DATASYNC);
+my_uring_prep_fsync(sqe, dsk.journal_fd, IORING_FSYNC_DATASYNC);
data->iov = { 0 };
data->callback = [this, op](ring_data_t *data) { handle_write_event(data, op); };
PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 0;
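
Throughout this file, journal entries are sized as sizeof(journal_entry_big_write) + clean_entry_bitmap_size because the object bitmap is serialized immediately after the fixed-size entry header, which is exactly where the `(void*)(je+1)` memcpy above writes. A minimal layout sketch (simplified hypothetical header, not the real journal_entry_big_write):

#include <cstdint>
#include <cstring>

// Simplified stand-in for a journal entry header.
struct je_sketch
{
    uint32_t crc32;
    uint16_t type;
    uint32_t size; // sizeof(je_sketch) + bitmap size
};

// Serialize header + bitmap the same way as the (void*)(je+1) memcpy above:
// the bitmap bytes follow the header directly in the journal buffer.
void fill_entry(uint8_t *buf, const je_sketch & hdr, const void *bitmap, uint32_t bmp_size)
{
    memcpy(buf, &hdr, sizeof(hdr));
    memcpy(buf + sizeof(hdr), bitmap, bmp_size);
}
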

View File

@@ -10,9 +10,9 @@ bool blockstore_impl_t::enqueue_write(blockstore_op_t *op)
bool wait_big = false, wait_del = false;
void *bmp = NULL;
uint64_t version = 1;
-if (!is_del && clean_entry_bitmap_size > sizeof(void*))
+if (!is_del && dsk.clean_entry_bitmap_size > sizeof(void*))
{
-bmp = calloc_or_die(1, clean_entry_bitmap_size);
+bmp = calloc_or_die(1, dsk.clean_entry_bitmap_size);
}
if (dirty_db.size() > 0)
{
@@ -32,8 +32,8 @@ bool blockstore_impl_t::enqueue_write(blockstore_op_t *op)
: ((dirty_it->second.state & BS_ST_WORKFLOW_MASK) == BS_ST_WAIT_BIG);
if (!is_del && !deleted)
{
-if (clean_entry_bitmap_size > sizeof(void*))
-memcpy(bmp, dirty_it->second.bitmap, clean_entry_bitmap_size);
+if (dsk.clean_entry_bitmap_size > sizeof(void*))
+memcpy(bmp, dirty_it->second.bitmap, dsk.clean_entry_bitmap_size);
else
bmp = dirty_it->second.bitmap;
}
@@ -48,8 +48,8 @@ bool blockstore_impl_t::enqueue_write(blockstore_op_t *op)
version = clean_it->second.version + 1;
if (!is_del)
{
-void *bmp_ptr = get_clean_entry_bitmap(clean_it->second.location, clean_entry_bitmap_size);
-memcpy((clean_entry_bitmap_size > sizeof(void*) ? bmp : &bmp), bmp_ptr, clean_entry_bitmap_size);
+void *bmp_ptr = get_clean_entry_bitmap(clean_it->second.location, dsk.clean_entry_bitmap_size);
+memcpy((dsk.clean_entry_bitmap_size > sizeof(void*) ? bmp : &bmp), bmp_ptr, dsk.clean_entry_bitmap_size);
}
}
else
@@ -90,14 +90,14 @@ bool blockstore_impl_t::enqueue_write(blockstore_op_t *op)
{
// Invalid version requested
op->retval = -EEXIST;
-if (!is_del && clean_entry_bitmap_size > sizeof(void*))
+if (!is_del && dsk.clean_entry_bitmap_size > sizeof(void*))
{
free(bmp);
}
return false;
}
}
-if (wait_big && !is_del && !deleted && op->len < data_block_size &&
+if (wait_big && !is_del && !deleted && op->len < dsk.data_block_size &&
immediate_commit != IMMEDIATE_ALL)
{
// Issue an additional sync so that the previous big write can reach the journal
@@ -122,7 +122,7 @@ bool blockstore_impl_t::enqueue_write(blockstore_op_t *op)
state = BS_ST_DELETE | BS_ST_IN_FLIGHT;
else
{
-state = (op->len == data_block_size || deleted ? BS_ST_BIG_WRITE : BS_ST_SMALL_WRITE);
+state = (op->len == dsk.data_block_size || deleted ? BS_ST_BIG_WRITE : BS_ST_SMALL_WRITE);
if (state == BS_ST_SMALL_WRITE && throttle_small_writes)
clock_gettime(CLOCK_REALTIME, &PRIV(op)->tv_begin);
if (wait_del)
@@ -136,9 +136,9 @@ bool blockstore_impl_t::enqueue_write(blockstore_op_t *op)
if (op->bitmap)
{
// Only allow to overwrite part of the object bitmap respective to the write's offset/len
-uint8_t *bmp_ptr = (uint8_t*)(clean_entry_bitmap_size > sizeof(void*) ? bmp : &bmp);
-uint32_t bit = op->offset/bitmap_granularity;
-uint32_t bits_left = op->len/bitmap_granularity;
+uint8_t *bmp_ptr = (uint8_t*)(dsk.clean_entry_bitmap_size > sizeof(void*) ? bmp : &bmp);
+uint32_t bit = op->offset/dsk.bitmap_granularity;
+uint32_t bits_left = op->len/dsk.bitmap_granularity;
while (!(bit % 8) && bits_left > 8)
{
// Copy bytes
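
The hunk above is truncated by the diff context, but the surrounding loop merges the caller's bitmap into the object bitmap only over the written range: whole bytes while the position stays byte-aligned, then single bits at the edges. A minimal sketch of that idiom (hypothetical helper, simplified from the real loop):

#include <cstdint>

// Merge src's bits into dst over [bit, bit + bits_left).
void copy_bitmap_range(uint8_t *dst, const uint8_t *src, uint32_t bit, uint32_t bits_left)
{
    while (!(bit % 8) && bits_left >= 8)
    {
        dst[bit / 8] = src[bit / 8]; // copy a whole byte at once
        bit += 8;
        bits_left -= 8;
    }
    while (bits_left > 0)
    {
        // copy the remaining unaligned bits one at a time
        dst[bit / 8] = (dst[bit / 8] & ~(1 << (bit % 8))) | (src[bit / 8] & (1 << (bit % 8)));
        bit++;
        bits_left--;
    }
}
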
@@ -175,7 +175,7 @@ void blockstore_impl_t::cancel_all_writes(blockstore_op_t *op, blockstore_dirty_
{
while (dirty_it != dirty_db.end() && dirty_it->first.oid == op->oid)
{
-if (clean_entry_bitmap_size > sizeof(void*))
+if (dsk.clean_entry_bitmap_size > sizeof(void*))
free(dirty_it->second.bitmap);
dirty_db.erase(dirty_it++);
}
@@ -251,7 +251,7 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
{
blockstore_journal_check_t space_check(this);
if (!space_check.check_available(op, unsynced_big_write_count + 1,
-sizeof(journal_entry_big_write) + clean_entry_bitmap_size, JOURNAL_STABILIZE_RESERVATION))
+sizeof(journal_entry_big_write) + dsk.clean_entry_bitmap_size, JOURNAL_STABILIZE_RESERVATION))
{
return 0;
}
@@ -271,7 +271,7 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
}
BS_SUBMIT_GET_SQE(sqe, data);
write_iodepth++;
-dirty_it->second.location = loc << block_order;
+dirty_it->second.location = loc << dsk.block_order;
dirty_it->second.state = (dirty_it->second.state & ~BS_ST_WORKFLOW_MASK) | BS_ST_SUBMITTED;
#ifdef BLOCKSTORE_DEBUG
printf(
@@ -280,9 +280,9 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
);
#endif
data_alloc->set(loc, true);
-uint64_t stripe_offset = (op->offset % bitmap_granularity);
-uint64_t stripe_end = (op->offset + op->len) % bitmap_granularity;
-// Zero fill up to bitmap_granularity
+uint64_t stripe_offset = (op->offset % dsk.bitmap_granularity);
+uint64_t stripe_end = (op->offset + op->len) % dsk.bitmap_granularity;
+// Zero fill up to dsk.bitmap_granularity
int vcnt = 0;
if (stripe_offset)
{
@@ -291,13 +291,13 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
PRIV(op)->iov_zerofill[vcnt++] = (struct iovec){ op->buf, op->len };
if (stripe_end)
{
-stripe_end = bitmap_granularity - stripe_end;
+stripe_end = dsk.bitmap_granularity - stripe_end;
PRIV(op)->iov_zerofill[vcnt++] = (struct iovec){ zero_object, stripe_end };
}
data->iov.iov_len = op->len + stripe_offset + stripe_end; // to check it in the callback
data->callback = [this, op](ring_data_t *data) { handle_write_event(data, op); };
my_uring_prep_writev(
-sqe, data_fd, PRIV(op)->iov_zerofill, vcnt, data_offset + (loc << block_order) + op->offset - stripe_offset
+sqe, dsk.data_fd, PRIV(op)->iov_zerofill, vcnt, dsk.data_offset + (loc << dsk.block_order) + op->offset - stripe_offset
);
PRIV(op)->pending_ops = 1;
PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 0;
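
The zero-fill logic in the last two hunks pads a write that is not aligned to bitmap_granularity with up to three iovecs, [zero head][user data][zero tail], submitted as a single writev() whose offset is rounded down to the granularity boundary. A minimal sketch, with zero_buf standing in for the zero_object buffer:

#include <sys/uio.h>
#include <cstdint>

// Build the padded iovec list; the writev offset is then base + offset - head.
int build_zerofill_iov(struct iovec *iov, void *buf, uint64_t offset, uint64_t len,
    uint64_t granularity, void *zero_buf)
{
    int vcnt = 0;
    uint64_t head = offset % granularity;         // zero bytes before the data
    uint64_t tail = (offset + len) % granularity; // end position inside a stripe
    if (head)
        iov[vcnt++] = (struct iovec){ zero_buf, head };
    iov[vcnt++] = (struct iovec){ buf, len };
    if (tail)
        iov[vcnt++] = (struct iovec){ zero_buf, granularity - tail };
    return vcnt;
}
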
@@ -319,9 +319,9 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
blockstore_journal_check_t space_check(this);
if (unsynced_big_write_count &&
!space_check.check_available(op, unsynced_big_write_count,
-sizeof(journal_entry_big_write) + clean_entry_bitmap_size, 0)
+sizeof(journal_entry_big_write) + dsk.clean_entry_bitmap_size, 0)
|| !space_check.check_available(op, 1,
-sizeof(journal_entry_small_write) + clean_entry_bitmap_size, op->len + JOURNAL_STABILIZE_RESERVATION))
+sizeof(journal_entry_small_write) + dsk.clean_entry_bitmap_size, op->len + JOURNAL_STABILIZE_RESERVATION))
{
return 0;
}
@@ -329,7 +329,7 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
BS_SUBMIT_CHECK_SQES(
// Write current journal sector only if it's dirty and full, or in the immediate_commit mode
(immediate_commit != IMMEDIATE_NONE ||
-!journal.entry_fits(sizeof(journal_entry_small_write) + clean_entry_bitmap_size) ? 1 : 0) +
+!journal.entry_fits(sizeof(journal_entry_small_write) + dsk.clean_entry_bitmap_size) ? 1 : 0) +
(op->len > 0 ? 1 : 0)
);
write_iodepth++;
@@ -337,7 +337,7 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
auto cb = [this, op](ring_data_t *data) { handle_write_event(data, op); };
if (immediate_commit == IMMEDIATE_NONE)
{
-if (!journal.entry_fits(sizeof(journal_entry_small_write) + clean_entry_bitmap_size))
+if (!journal.entry_fits(sizeof(journal_entry_small_write) + dsk.clean_entry_bitmap_size))
{
prepare_journal_sector_write(journal.cur_sector, op);
}
@@ -349,7 +349,7 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
// Then pre-fill journal entry
journal_entry_small_write *je = (journal_entry_small_write*)prefill_single_journal_entry(
journal, op->opcode == BS_OP_WRITE_STABLE ? JE_SMALL_WRITE_INSTANT : JE_SMALL_WRITE,
-sizeof(journal_entry_small_write) + clean_entry_bitmap_size
+sizeof(journal_entry_small_write) + dsk.clean_entry_bitmap_size
);
dirty_it->second.journal_sector = journal.sector_info[journal.cur_sector].offset;
journal.used_sectors[journal.sector_info[journal.cur_sector].offset]++;
@@ -361,14 +361,14 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
);
#endif
// Figure out where data will be
-journal.next_free = (journal.next_free + op->len) <= journal.len ? journal.next_free : journal_block_size;
+journal.next_free = (journal.next_free + op->len) <= journal.len ? journal.next_free : dsk.journal_block_size;
je->oid = op->oid;
je->version = op->version;
je->offset = op->offset;
je->len = op->len;
je->data_offset = journal.next_free;
je->crc32_data = crc32c(0, op->buf, op->len);
-memcpy((void*)(je+1), (clean_entry_bitmap_size > sizeof(void*) ? dirty_it->second.bitmap : &dirty_it->second.bitmap), clean_entry_bitmap_size);
+memcpy((void*)(je+1), (dsk.clean_entry_bitmap_size > sizeof(void*) ? dirty_it->second.bitmap : &dirty_it->second.bitmap), dsk.clean_entry_bitmap_size);
je->crc32 = je_crc32((journal_entry*)je);
journal.crc32_last = je->crc32;
if (immediate_commit != IMMEDIATE_NONE)
@@ -387,7 +387,7 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
data2->iov = (struct iovec){ op->buf, op->len };
data2->callback = cb;
my_uring_prep_writev(
-sqe2, journal.fd, &data2->iov, 1, journal.offset + journal.next_free
+sqe2, dsk.journal_fd, &data2->iov, 1, journal.offset + journal.next_free
);
PRIV(op)->pending_ops++;
}
@@ -400,7 +400,7 @@ int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
journal.next_free += op->len;
if (journal.next_free >= journal.len)
{
-journal.next_free = journal_block_size;
+journal.next_free = dsk.journal_block_size;
}
if (!PRIV(op)->pending_ops)
{
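
The next_free updates above make the journal's data area circular: a write that would run past journal.len instead starts over at journal_block_size, skipping the first journal block, which holds the journal start record. A minimal sketch of that placement rule (hypothetical helper, assuming these four values):

#include <cstdint>

// Returns where the next op's data lands in the journal and advances next_free.
uint64_t place_in_journal(uint64_t & next_free, uint64_t len,
    uint64_t journal_len, uint64_t journal_block_size)
{
    if (next_free + len > journal_len)
        next_free = journal_block_size; // wrap past the reserved first block
    uint64_t data_offset = next_free;
    next_free += len;
    if (next_free >= journal_len)
        next_free = journal_block_size;
    return data_offset;
}
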
@@ -440,7 +440,7 @@ resume_2:
assert(dirty_it != dirty_db.end());
journal_entry_big_write *je = (journal_entry_big_write*)prefill_single_journal_entry(
journal, op->opcode == BS_OP_WRITE_STABLE ? JE_BIG_WRITE_INSTANT : JE_BIG_WRITE,
-sizeof(journal_entry_big_write) + clean_entry_bitmap_size
+sizeof(journal_entry_big_write) + dsk.clean_entry_bitmap_size
);
dirty_it->second.journal_sector = journal.sector_info[journal.cur_sector].offset;
journal.used_sectors[journal.sector_info[journal.cur_sector].offset]++;
@@ -456,7 +456,7 @@ resume_2:
je->offset = op->offset;
je->len = op->len;
je->location = dirty_it->second.location;
-memcpy((void*)(je+1), (clean_entry_bitmap_size > sizeof(void*) ? dirty_it->second.bitmap : &dirty_it->second.bitmap), clean_entry_bitmap_size);
+memcpy((void*)(je+1), (dsk.clean_entry_bitmap_size > sizeof(void*) ? dirty_it->second.bitmap : &dirty_it->second.bitmap), dsk.clean_entry_bitmap_size);
je->crc32 = je_crc32((journal_entry*)je);
journal.crc32_last = je->crc32;
prepare_journal_sector_write(journal.cur_sector, op);
@@ -634,7 +634,7 @@ int blockstore_impl_t::dequeue_del(blockstore_op_t *op)
// Write current journal sector only if it's dirty and full, or in the immediate_commit mode
BS_SUBMIT_CHECK_SQES(
(immediate_commit != IMMEDIATE_NONE ||
-(journal_block_size - journal.in_sector_pos) < sizeof(journal_entry_del) &&
+(dsk.journal_block_size - journal.in_sector_pos) < sizeof(journal_entry_del) &&
journal.sector_info[journal.cur_sector].dirty) ? 1 : 0
);
if (write_iodepth >= max_write_iodepth)
@@ -645,7 +645,7 @@ int blockstore_impl_t::dequeue_del(blockstore_op_t *op)
// Prepare journal sector write
if (immediate_commit == IMMEDIATE_NONE)
{
-if ((journal_block_size - journal.in_sector_pos) < sizeof(journal_entry_del) &&
+if ((dsk.journal_block_size - journal.in_sector_pos) < sizeof(journal_entry_del) &&
journal.sector_info[journal.cur_sector].dirty)
{
prepare_journal_sector_write(journal.cur_sector, op);
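
The `(journal_block_size - journal.in_sector_pos) < sizeof(journal_entry_del)` check above, like journal.entry_fits() elsewhere in this commit, keeps a journal entry from straddling a sector boundary: if the remaining space in the current sector cannot hold the entry, the sector is flushed and the entry goes into the next one. A one-line sketch of the predicate, assuming these three fields:

#include <cstdint>

// True when the current journal sector still has room for the whole entry.
bool entry_fits(uint32_t journal_block_size, uint32_t in_sector_pos, uint32_t entry_size)
{
    return journal_block_size - in_sector_pos >= entry_size;
}
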