Compare commits

...

9 Commits

  1. 2
      debian/vitastor-osd.install
  2. 2
      rpm/vitastor-el7.spec
  3. 2
      rpm/vitastor-el8.spec
  4. 18
      src/CMakeLists.txt
  5. 10
      src/blockstore.h
  6. 322
      src/blockstore_disk.cpp
  7. 42
      src/blockstore_disk.h
  8. 78
      src/blockstore_flush.cpp
  9. 33
      src/blockstore_impl.cpp
  10. 37
      src/blockstore_impl.h
  11. 101
      src/blockstore_init.cpp
  12. 2
      src/blockstore_init.h
  13. 2
      src/blockstore_journal.cpp
  14. 1
      src/blockstore_journal.h
  15. 322
      src/blockstore_open.cpp
  16. 50
      src/blockstore_read.cpp
  17. 8
      src/blockstore_rollback.cpp
  18. 8
      src/blockstore_stable.cpp
  19. 14
      src/blockstore_sync.cpp
  20. 66
      src/blockstore_write.cpp
  21. 2
      src/cli_simple_offsets.cpp
  22. 2
      src/cluster_client.cpp
  23. 4
      src/cluster_client.h
  24. 991
      src/disk_tool.cpp
  25. 224
      src/dump_journal.cpp
  26. 169
      src/dump_meta.cpp

2
debian/vitastor-osd.install

@ -1,3 +1,3 @@
usr/bin/vitastor-osd
usr/bin/vitastor-disk
usr/bin/vitastor-dump-journal
usr/bin/vitastor-dump-meta

2
rpm/vitastor-el7.spec

@ -112,8 +112,8 @@ cp -r mon %buildroot/usr/lib/vitastor
%files -n vitastor-osd
%_bindir/vitastor-osd
%_bindir/vitastor-disk
%_bindir/vitastor-dump-journal
%_bindir/vitastor-dump-meta
%files -n vitastor-mon

2
rpm/vitastor-el8.spec

@ -109,8 +109,8 @@ cp -r mon %buildroot/usr/lib/vitastor
%files -n vitastor-osd
%_bindir/vitastor-osd
%_bindir/vitastor-disk
%_bindir/vitastor-dump-journal
%_bindir/vitastor-dump-meta
%files -n vitastor-mon

18
src/CMakeLists.txt

@ -64,7 +64,7 @@ include_directories(
# libvitastor_blk.so
add_library(vitastor_blk SHARED
allocator.cpp blockstore.cpp blockstore_impl.cpp blockstore_init.cpp blockstore_open.cpp blockstore_journal.cpp blockstore_read.cpp
allocator.cpp blockstore.cpp blockstore_impl.cpp blockstore_disk.cpp blockstore_init.cpp blockstore_open.cpp blockstore_journal.cpp blockstore_read.cpp
blockstore_write.cpp blockstore_sync.cpp blockstore_stable.cpp blockstore_rollback.cpp blockstore_flush.cpp crc32c.c ringloop.cpp
)
target_link_libraries(vitastor_blk
@ -193,14 +193,13 @@ target_link_libraries(vitastor-cli
)
configure_file(vitastor.pc.in vitastor.pc @ONLY)
# vitastor-dump-journal
add_executable(vitastor-dump-journal
dump_journal.cpp crc32c.c
# vitastor-disk
add_executable(vitastor-disk
disk_tool.cpp crc32c.c rw_blocking.cpp allocator.cpp ringloop.cpp blockstore_disk.cpp
)
# vitastor-dump-meta
add_executable(vitastor-dump-meta
dump_meta.cpp rw_blocking.cpp
target_link_libraries(vitastor-disk
tcmalloc_minimal
${LIBURING_LIBRARIES}
)
if (${WITH_QEMU})
@ -280,7 +279,8 @@ target_include_directories(test_cluster_client PUBLIC ${CMAKE_SOURCE_DIR}/src/mo
### Install
install(TARGETS vitastor-osd vitastor-dump-journal vitastor-dump-meta vitastor-nbd vitastor-nfs vitastor-cli RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
install(TARGETS vitastor-osd vitastor-disk vitastor-nbd vitastor-nfs vitastor-cli RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
install_symlink(vitastor-disk ${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_BINDIR}/vitastor-dump-journal)
install_symlink(vitastor-cli ${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_BINDIR}/vitastor-rm)
install_symlink(vitastor-cli ${CMAKE_INSTALL_PREFIX}/${CMAKE_INSTALL_BINDIR}/vita)
install(

10
src/blockstore.h

@ -11,7 +11,6 @@
#include <string>
#include <map>
#include <unordered_map>
#include <functional>
#include "object_id.h"
@ -29,9 +28,9 @@
#endif
// Default block size is 128 KB, current allowed range is 4K - 128M
#define DEFAULT_ORDER 17
#define MIN_BLOCK_SIZE 4*1024
#define MAX_BLOCK_SIZE 128*1024*1024
#define DEFAULT_DATA_BLOCK_ORDER 17
#define MIN_DATA_BLOCK_SIZE 4*1024
#define MAX_DATA_BLOCK_SIZE 128*1024*1024
#define DEFAULT_BITMAP_GRANULARITY 4096
#define BS_OP_MIN 1
@ -155,7 +154,7 @@ struct blockstore_op_t
uint8_t private_data[BS_OP_PRIVATE_DATA_SIZE];
};
typedef std::unordered_map<std::string, std::string> blockstore_config_t;
typedef std::map<std::string, std::string> blockstore_config_t;
class blockstore_impl_t;
@ -193,7 +192,6 @@ public:
// Print diagnostics to stdout
void dump_diagnostics();
// FIXME rename to object_size
uint32_t get_block_size();
uint64_t get_block_count();
uint64_t get_free_block_count();

322
src/blockstore_disk.cpp

@ -0,0 +1,322 @@
// Copyright (c) Vitaliy Filippov, 2019+
// License: VNPL-1.1 (see README.md for details)
#include <sys/file.h>
#include <stdexcept>
#include "blockstore_impl.h"
#include "blockstore_disk.h"
static uint32_t is_power_of_two(uint64_t value)
{
uint32_t l = 0;
while (value > 1)
{
if (value & 1)
{
return 64;
}
value = value >> 1;
l++;
}
return l;
}
void blockstore_disk_t::parse_config(std::map<std::string, std::string> & config)
{
// Parse
if (config["disable_device_lock"] == "true" || config["disable_device_lock"] == "1" || config["disable_device_lock"] == "yes")
{
disable_flock = true;
}
cfg_journal_size = strtoull(config["journal_size"].c_str(), NULL, 10);
data_device = config["data_device"];
data_offset = strtoull(config["data_offset"].c_str(), NULL, 10);
cfg_data_size = strtoull(config["data_size"].c_str(), NULL, 10);
meta_device = config["meta_device"];
meta_offset = strtoull(config["meta_offset"].c_str(), NULL, 10);
data_block_size = strtoull(config["block_size"].c_str(), NULL, 10);
journal_device = config["journal_device"];
journal_offset = strtoull(config["journal_offset"].c_str(), NULL, 10);
disk_alignment = strtoull(config["disk_alignment"].c_str(), NULL, 10);
journal_block_size = strtoull(config["journal_block_size"].c_str(), NULL, 10);
meta_block_size = strtoull(config["meta_block_size"].c_str(), NULL, 10);
bitmap_granularity = strtoull(config["bitmap_granularity"].c_str(), NULL, 10);
// Validate
if (!data_block_size)
{
data_block_size = (1 << DEFAULT_DATA_BLOCK_ORDER);
}
if ((block_order = is_power_of_two(data_block_size)) >= 64 || data_block_size < MIN_DATA_BLOCK_SIZE || data_block_size >= MAX_DATA_BLOCK_SIZE)
{
throw std::runtime_error("Bad block size");
}
if (!disk_alignment)
{
disk_alignment = 4096;
}
else if (disk_alignment % DIRECT_IO_ALIGNMENT)
{
throw std::runtime_error("disk_alignment must be a multiple of "+std::to_string(DIRECT_IO_ALIGNMENT));
}
if (!journal_block_size)
{
journal_block_size = 4096;
}
else if (journal_block_size % DIRECT_IO_ALIGNMENT)
{
throw std::runtime_error("journal_block_size must be a multiple of "+std::to_string(DIRECT_IO_ALIGNMENT));
}
if (!meta_block_size)
{
meta_block_size = 4096;
}
else if (meta_block_size % DIRECT_IO_ALIGNMENT)
{
throw std::runtime_error("meta_block_size must be a multiple of "+std::to_string(DIRECT_IO_ALIGNMENT));
}
if (data_offset % disk_alignment)
{
throw std::runtime_error("data_offset must be a multiple of disk_alignment = "+std::to_string(disk_alignment));
}
if (!bitmap_granularity)
{
bitmap_granularity = DEFAULT_BITMAP_GRANULARITY;
}
else if (bitmap_granularity % disk_alignment)
{
throw std::runtime_error("Sparse write tracking granularity must be a multiple of disk_alignment = "+std::to_string(disk_alignment));
}
if (data_block_size % bitmap_granularity)
{
throw std::runtime_error("Block size must be a multiple of sparse write tracking granularity");
}
if (meta_device == "")
{
meta_device = data_device;
}
if (journal_device == "")
{
journal_device = meta_device;
}
if (meta_offset % meta_block_size)
{
throw std::runtime_error("meta_offset must be a multiple of meta_block_size = "+std::to_string(meta_block_size));
}
if (journal_offset % journal_block_size)
{
throw std::runtime_error("journal_offset must be a multiple of journal_block_size = "+std::to_string(journal_block_size));
}
clean_entry_bitmap_size = data_block_size / bitmap_granularity / 8;
clean_entry_size = sizeof(clean_disk_entry) + 2*clean_entry_bitmap_size;
}
void blockstore_disk_t::calc_lengths()
{
// data
data_len = data_device_size - data_offset;
if (data_fd == meta_fd && data_offset < meta_offset)
{
data_len = meta_offset - data_offset;
}
if (data_fd == journal_fd && data_offset < journal_offset)
{
data_len = data_len < journal_offset-data_offset
? data_len : journal_offset-data_offset;
}
if (cfg_data_size != 0)
{
if (data_len < cfg_data_size)
{
throw std::runtime_error("Data area ("+std::to_string(data_len)+
" bytes) is smaller than configured size ("+std::to_string(cfg_data_size)+" bytes)");
}
data_len = cfg_data_size;
}
// meta
uint64_t meta_area_size = (meta_fd == data_fd ? data_device_size : meta_device_size) - meta_offset;
if (meta_fd == data_fd && meta_offset <= data_offset)
{
meta_area_size = data_offset - meta_offset;
}
if (meta_fd == journal_fd && meta_offset <= journal_offset)
{
meta_area_size = meta_area_size < journal_offset-meta_offset
? meta_area_size : journal_offset-meta_offset;
}
// journal
journal_len = (journal_fd == data_fd ? data_device_size : (journal_fd == meta_fd ? meta_device_size : journal_device_size)) - journal_offset;
if (journal_fd == data_fd && journal_offset <= data_offset)
{
journal_len = data_offset - journal_offset;
}
if (journal_fd == meta_fd && journal_offset <= meta_offset)
{
journal_len = journal_len < meta_offset-journal_offset
? journal_len : meta_offset-journal_offset;
}
// required metadata size
block_count = data_len / data_block_size;
meta_len = (1 + (block_count - 1 + meta_block_size / clean_entry_size) / (meta_block_size / clean_entry_size)) * meta_block_size;
if (meta_area_size < meta_len)
{
throw std::runtime_error("Metadata area is too small, need at least "+std::to_string(meta_len)+" bytes");
}
// requested journal size
if (cfg_journal_size > journal_len)
{
throw std::runtime_error("Requested journal_size is too large");
}
else if (cfg_journal_size > 0)
{
journal_len = cfg_journal_size;
}
if (journal_len < MIN_JOURNAL_SIZE)
{
throw std::runtime_error("Journal is too small, need at least "+std::to_string(MIN_JOURNAL_SIZE)+" bytes");
}
}
static void check_size(int fd, uint64_t *size, uint64_t *sectsize, std::string name)
{
int sect;
struct stat st;
if (fstat(fd, &st) < 0)
{
throw std::runtime_error("Failed to stat "+name);
}
if (S_ISREG(st.st_mode))
{
*size = st.st_size;
if (sectsize)
{
*sectsize = st.st_blksize;
}
}
else if (S_ISBLK(st.st_mode))
{
if (ioctl(fd, BLKGETSIZE64, size) < 0 ||
ioctl(fd, BLKSSZGET, &sect) < 0)
{
throw std::runtime_error("Failed to get "+name+" size or block size: "+strerror(errno));
}
if (sectsize)
{
*sectsize = sect;
}
}
else
{
throw std::runtime_error(name+" is neither a file nor a block device");
}
}
void blockstore_disk_t::open_data()
{
data_fd = open(data_device.c_str(), O_DIRECT|O_RDWR);
if (data_fd == -1)
{
throw std::runtime_error("Failed to open data device");
}
check_size(data_fd, &data_device_size, &data_device_sect, "data device");
if (disk_alignment % data_device_sect)
{
throw std::runtime_error(
"disk_alignment ("+std::to_string(disk_alignment)+
") is not a multiple of data device sector size ("+std::to_string(data_device_sect)+")"
);
}
if (data_offset >= data_device_size)
{
throw std::runtime_error("data_offset exceeds device size = "+std::to_string(data_device_size));
}
if (!disable_flock && flock(data_fd, LOCK_EX|LOCK_NB) != 0)
{
throw std::runtime_error(std::string("Failed to lock data device: ") + strerror(errno));
}
}
void blockstore_disk_t::open_meta()
{
if (meta_device != data_device)
{
meta_offset = 0;
meta_fd = open(meta_device.c_str(), O_DIRECT|O_RDWR);
if (meta_fd == -1)
{
throw std::runtime_error("Failed to open metadata device");
}
check_size(meta_fd, &meta_device_size, &meta_device_sect, "metadata device");
if (meta_offset >= meta_device_size)
{
throw std::runtime_error("meta_offset exceeds device size = "+std::to_string(meta_device_size));
}
if (!disable_flock && flock(meta_fd, LOCK_EX|LOCK_NB) != 0)
{
throw std::runtime_error(std::string("Failed to lock metadata device: ") + strerror(errno));
}
}
else
{
meta_fd = data_fd;
meta_device_sect = data_device_sect;
meta_device_size = 0;
if (meta_offset >= data_device_size)
{
throw std::runtime_error("meta_offset exceeds device size = "+std::to_string(data_device_size));
}
}
if (meta_block_size % meta_device_sect)
{
throw std::runtime_error(
"meta_block_size ("+std::to_string(meta_block_size)+
") is not a multiple of data device sector size ("+std::to_string(meta_device_sect)+")"
);
}
}
void blockstore_disk_t::open_journal()
{
if (journal_device != meta_device)
{
journal_fd = open(journal_device.c_str(), O_DIRECT|O_RDWR);
if (journal_fd == -1)
{
throw std::runtime_error("Failed to open journal device");
}
check_size(journal_fd, &journal_device_size, &journal_device_sect, "journal device");
if (!disable_flock && flock(journal_fd, LOCK_EX|LOCK_NB) != 0)
{
throw std::runtime_error(std::string("Failed to lock journal device: ") + strerror(errno));
}
}
else
{
journal_fd = meta_fd;
journal_device_sect = meta_device_sect;
journal_device_size = 0;
if (journal_offset >= data_device_size)
{
throw std::runtime_error("journal_offset exceeds device size");
}
}
if (journal_block_size % journal_device_sect)
{
throw std::runtime_error(
"journal_block_size ("+std::to_string(journal_block_size)+
") is not a multiple of journal device sector size ("+std::to_string(journal_device_sect)+")"
);
}
}
void blockstore_disk_t::close_all()
{
if (data_fd >= 0)
close(data_fd);
if (meta_fd >= 0 && meta_fd != data_fd)
close(meta_fd);
if (journal_fd >= 0 && journal_fd != meta_fd)
close(journal_fd);
data_fd = meta_fd = journal_fd = -1;
}

42
src/blockstore_disk.h

@ -0,0 +1,42 @@
// Copyright (c) Vitaliy Filippov, 2019+
// License: VNPL-1.1 (see README.md for details)
#pragma once
#include <stdint.h>
#include <string>
#include <map>
struct blockstore_disk_t
{
std::string data_device, meta_device, journal_device;
uint32_t data_block_size;
uint64_t cfg_journal_size, cfg_data_size;
// Required write alignment and journal/metadata/data areas' location alignment
uint32_t disk_alignment = 4096;
// Journal block size - minimum_io_size of the journal device is the best choice
uint64_t journal_block_size = 4096;
// Metadata block size - minimum_io_size of the metadata device is the best choice
uint64_t meta_block_size = 4096;
// Sparse write tracking granularity. 4 KB is a good choice. Must be a multiple of disk_alignment
uint64_t bitmap_granularity = 4096;
// By default, Blockstore locks all opened devices exclusively. This option can be used to disable locking
bool disable_flock = false;
int meta_fd = -1, data_fd = -1, journal_fd = -1;
uint64_t meta_offset, meta_device_sect, meta_device_size, meta_len;
uint64_t data_offset, data_device_sect, data_device_size, data_len;
uint64_t journal_offset, journal_device_sect, journal_device_size, journal_len;
uint32_t block_order;
uint64_t block_count;
uint32_t clean_entry_bitmap_size = 0, clean_entry_size = 0;
void parse_config(std::map<std::string, std::string> & config);
void open_data();
void open_meta();
void open_journal();
void calc_lengths();
void close_all();
};

78
src/blockstore_flush.cpp

@ -15,11 +15,11 @@ journal_flusher_t::journal_flusher_t(blockstore_impl_t *bs)
active_flushers = 0;
syncing_flushers = 0;
// FIXME: allow to configure flusher_start_threshold and journal_trim_interval
flusher_start_threshold = bs->journal_block_size / sizeof(journal_entry_stable);
flusher_start_threshold = bs->dsk.journal_block_size / sizeof(journal_entry_stable);
journal_trim_interval = 512;
journal_trim_counter = bs->journal.flush_journal ? 1 : 0;
trim_wanted = bs->journal.flush_journal ? 1 : 0;
journal_superblock = bs->journal.inmemory ? bs->journal.buffer : memalign_or_die(MEM_ALIGNMENT, bs->journal_block_size);
journal_superblock = bs->journal.inmemory ? bs->journal.buffer : memalign_or_die(MEM_ALIGNMENT, bs->dsk.journal_block_size);
co = new journal_flusher_co[max_flusher_count];
for (int i = 0; i < max_flusher_count; i++)
{
@ -486,28 +486,28 @@ resume_1:
bs->ringloop->wakeup();
}
// Reads completed, submit writes and set bitmap bits
if (bs->clean_entry_bitmap_size)
if (bs->dsk.clean_entry_bitmap_size)
{
new_clean_bitmap = (bs->inmemory_meta
? (uint8_t*)meta_new.buf + meta_new.pos*bs->clean_entry_size + sizeof(clean_disk_entry)
: (uint8_t*)bs->clean_bitmap + (clean_loc >> bs->block_order)*(2*bs->clean_entry_bitmap_size));
? (uint8_t*)meta_new.buf + meta_new.pos*bs->dsk.clean_entry_size + sizeof(clean_disk_entry)
: (uint8_t*)bs->clean_bitmap + (clean_loc >> bs->dsk.block_order)*(2*bs->dsk.clean_entry_bitmap_size));
if (clean_init_bitmap)
{
memset(new_clean_bitmap, 0, bs->clean_entry_bitmap_size);
bitmap_set(new_clean_bitmap, clean_bitmap_offset, clean_bitmap_len, bs->bitmap_granularity);
memset(new_clean_bitmap, 0, bs->dsk.clean_entry_bitmap_size);
bitmap_set(new_clean_bitmap, clean_bitmap_offset, clean_bitmap_len, bs->dsk.bitmap_granularity);
}
}
for (it = v.begin(); it != v.end(); it++)
{
if (new_clean_bitmap)
{
bitmap_set(new_clean_bitmap, it->offset, it->len, bs->bitmap_granularity);
bitmap_set(new_clean_bitmap, it->offset, it->len, bs->dsk.bitmap_granularity);
}
await_sqe(4);
data->iov = (struct iovec){ it->buf, (size_t)it->len };
data->callback = simple_callback_w;
my_uring_prep_writev(
sqe, bs->data_fd, &data->iov, 1, bs->data_offset + clean_loc + it->offset
sqe, bs->dsk.data_fd, &data->iov, 1, bs->dsk.data_offset + clean_loc + it->offset
);
wait_count++;
}
@ -536,35 +536,35 @@ resume_1:
return false;
}
// zero out old metadata entry
memset((uint8_t*)meta_old.buf + meta_old.pos*bs->clean_entry_size, 0, bs->clean_entry_size);
memset((uint8_t*)meta_old.buf + meta_old.pos*bs->dsk.clean_entry_size, 0, bs->dsk.clean_entry_size);
await_sqe(15);
data->iov = (struct iovec){ meta_old.buf, bs->meta_block_size };
data->iov = (struct iovec){ meta_old.buf, bs->dsk.meta_block_size };
data->callback = simple_callback_w;
my_uring_prep_writev(
sqe, bs->meta_fd, &data->iov, 1, bs->meta_offset + meta_old.sector
sqe, bs->dsk.meta_fd, &data->iov, 1, bs->dsk.meta_offset + meta_old.sector
);
wait_count++;
}
if (has_delete)
{
clean_disk_entry *new_entry = (clean_disk_entry*)((uint8_t*)meta_new.buf + meta_new.pos*bs->clean_entry_size);
clean_disk_entry *new_entry = (clean_disk_entry*)((uint8_t*)meta_new.buf + meta_new.pos*bs->dsk.clean_entry_size);
if (new_entry->oid.inode != 0 && new_entry->oid != cur.oid)
{
printf("Fatal error (metadata corruption or bug): tried to delete metadata entry %lu (%lx:%lx v%lu) while deleting %lx:%lx\n",
clean_loc >> bs->block_order, new_entry->oid.inode, new_entry->oid.stripe,
clean_loc >> bs->dsk.block_order, new_entry->oid.inode, new_entry->oid.stripe,
new_entry->version, cur.oid.inode, cur.oid.stripe);
exit(1);
}
// zero out new metadata entry
memset((uint8_t*)meta_new.buf + meta_new.pos*bs->clean_entry_size, 0, bs->clean_entry_size);
memset((uint8_t*)meta_new.buf + meta_new.pos*bs->dsk.clean_entry_size, 0, bs->dsk.clean_entry_size);
}
else
{
clean_disk_entry *new_entry = (clean_disk_entry*)((uint8_t*)meta_new.buf + meta_new.pos*bs->clean_entry_size);
clean_disk_entry *new_entry = (clean_disk_entry*)((uint8_t*)meta_new.buf + meta_new.pos*bs->dsk.clean_entry_size);
if (new_entry->oid.inode != 0 && new_entry->oid != cur.oid)
{
printf("Fatal error (metadata corruption or bug): tried to overwrite non-zero metadata entry %lu (%lx:%lx v%lu) with %lx:%lx v%lu\n",
clean_loc >> bs->block_order, new_entry->oid.inode, new_entry->oid.stripe, new_entry->version,
clean_loc >> bs->dsk.block_order, new_entry->oid.inode, new_entry->oid.stripe, new_entry->version,
cur.oid.inode, cur.oid.stripe, cur.version);
exit(1);
}
@ -572,20 +572,20 @@ resume_1:
new_entry->version = cur.version;
if (!bs->inmemory_meta)
{
memcpy(&new_entry->bitmap, new_clean_bitmap, bs->clean_entry_bitmap_size);
memcpy(&new_entry->bitmap, new_clean_bitmap, bs->dsk.clean_entry_bitmap_size);
}
// copy latest external bitmap/attributes
if (bs->clean_entry_bitmap_size)
if (bs->dsk.clean_entry_bitmap_size)
{
void *bmp_ptr = bs->clean_entry_bitmap_size > sizeof(void*) ? dirty_end->second.bitmap : &dirty_end->second.bitmap;
memcpy((uint8_t*)(new_entry+1) + bs->clean_entry_bitmap_size, bmp_ptr, bs->clean_entry_bitmap_size);
void *bmp_ptr = bs->dsk.clean_entry_bitmap_size > sizeof(void*) ? dirty_end->second.bitmap : &dirty_end->second.bitmap;
memcpy((uint8_t*)(new_entry+1) + bs->dsk.clean_entry_bitmap_size, bmp_ptr, bs->dsk.clean_entry_bitmap_size);
}
}
await_sqe(6);
data->iov = (struct iovec){ meta_new.buf, bs->meta_block_size };
data->iov = (struct iovec){ meta_new.buf, bs->dsk.meta_block_size };
data->callback = simple_callback_w;
my_uring_prep_writev(
sqe, bs->meta_fd, &data->iov, 1, bs->meta_offset + meta_new.sector
sqe, bs->dsk.meta_fd, &data->iov, 1, bs->dsk.meta_offset + meta_new.sector
);
wait_count++;
resume_7:
@ -669,9 +669,9 @@ resume_1:
.version = JOURNAL_VERSION,
};
((journal_entry_start*)flusher->journal_superblock)->crc32 = je_crc32((journal_entry*)flusher->journal_superblock);
data->iov = (struct iovec){ flusher->journal_superblock, bs->journal_block_size };
data->iov = (struct iovec){ flusher->journal_superblock, bs->dsk.journal_block_size };
data->callback = simple_callback_w;
my_uring_prep_writev(sqe, bs->journal.fd, &data->iov, 1, bs->journal.offset);
my_uring_prep_writev(sqe, bs->dsk.journal_fd, &data->iov, 1, bs->journal.offset);
wait_count++;
resume_13:
if (wait_count > 0)
@ -682,7 +682,7 @@ resume_1:
if (!bs->disable_journal_fsync)
{
await_sqe(20);
my_uring_prep_fsync(sqe, bs->journal.fd, IORING_FSYNC_DATASYNC);
my_uring_prep_fsync(sqe, bs->dsk.journal_fd, IORING_FSYNC_DATASYNC);
data->iov = { 0 };
data->callback = simple_callback_w;
resume_21:
@ -774,7 +774,7 @@ bool journal_flusher_co::scan_dirty(int wait_base)
data->iov = (struct iovec){ it->buf, (size_t)submit_len };
data->callback = simple_callback_r;
my_uring_prep_readv(
sqe, bs->journal.fd, &data->iov, 1, bs->journal.offset + submit_offset
sqe, bs->dsk.journal_fd, &data->iov, 1, bs->journal.offset + submit_offset
);
wait_count++;
}
@ -825,8 +825,8 @@ bool journal_flusher_co::modify_meta_read(uint64_t meta_loc, flusher_meta_write_
// And yet another option is to use LSM trees for metadata, but it sophisticates everything a lot,
// so I'll avoid it as long as I can.
wr.submitted = false;
wr.sector = ((meta_loc >> bs->block_order) / (bs->meta_block_size / bs->clean_entry_size)) * bs->meta_block_size;
wr.pos = ((meta_loc >> bs->block_order) % (bs->meta_block_size / bs->clean_entry_size));
wr.sector = ((meta_loc >> bs->dsk.block_order) / (bs->dsk.meta_block_size / bs->dsk.clean_entry_size)) * bs->dsk.meta_block_size;
wr.pos = ((meta_loc >> bs->dsk.block_order) % (bs->dsk.meta_block_size / bs->dsk.clean_entry_size));
if (bs->inmemory_meta)
{
wr.buf = (uint8_t*)bs->metadata_buffer + wr.sector;
@ -836,20 +836,20 @@ bool journal_flusher_co::modify_meta_read(uint64_t meta_loc, flusher_meta_write_
if (wr.it == flusher->meta_sectors.end())
{
// Not in memory yet, read it
wr.buf = memalign_or_die(MEM_ALIGNMENT, bs->meta_block_size);
wr.buf = memalign_or_die(MEM_ALIGNMENT, bs->dsk.meta_block_size);
wr.it = flusher->meta_sectors.emplace(wr.sector, (meta_sector_t){
.offset = wr.sector,
.len = bs->meta_block_size,
.len = bs->dsk.meta_block_size,
.state = 0, // 0 = not read yet
.buf = wr.buf,
.usage_count = 1,
}).first;
await_sqe(0);
data->iov = (struct iovec){ wr.it->second.buf, bs->meta_block_size };
data->iov = (struct iovec){ wr.it->second.buf, bs->dsk.meta_block_size };
data->callback = simple_callback_r;
wr.submitted = true;
my_uring_prep_readv(
sqe, bs->meta_fd, &data->iov, 1, bs->meta_offset + wr.sector
sqe, bs->dsk.meta_fd, &data->iov, 1, bs->dsk.meta_offset + wr.sector
);
wait_count++;
}
@ -867,11 +867,11 @@ void journal_flusher_co::update_clean_db()
{
#ifdef BLOCKSTORE_DEBUG
printf("Free block %lu from %lx:%lx v%lu (new location is %lu)\n",
old_clean_loc >> bs->block_order,
old_clean_loc >> bs->dsk.block_order,
cur.oid.inode, cur.oid.stripe, cur.version,
clean_loc >> bs->block_order);
clean_loc >> bs->dsk.block_order);
#endif
bs->data_alloc->set(old_clean_loc >> bs->block_order, false);
bs->data_alloc->set(old_clean_loc >> bs->dsk.block_order, false);
}
auto & clean_db = bs->clean_db_shard(cur.oid);
if (has_delete)
@ -880,10 +880,10 @@ void journal_flusher_co::update_clean_db()
clean_db.erase(clean_it);
#ifdef BLOCKSTORE_DEBUG
printf("Free block %lu from %lx:%lx v%lu (delete)\n",
clean_loc >> bs->block_order,
clean_loc >> bs->dsk.block_order,
cur.oid.inode, cur.oid.stripe, cur.version);
#endif
bs->data_alloc->set(clean_loc >> bs->block_order, false);
bs->data_alloc->set(clean_loc >> bs->dsk.block_order, false);
clean_loc = UINT64_MAX;
}
else
@ -932,7 +932,7 @@ bool journal_flusher_co::fsync_batch(bool fsync_meta, int wait_base)
await_sqe(0);
data->iov = { 0 };
data->callback = simple_callback_w;
my_uring_prep_fsync(sqe, fsync_meta ? bs->meta_fd : bs->data_fd, IORING_FSYNC_DATASYNC);
my_uring_prep_fsync(sqe, fsync_meta ? bs->dsk.meta_fd : bs->dsk.data_fd, IORING_FSYNC_DATASYNC);
cur_sync->state = 1;
wait_count++;
resume_2:

33
src/blockstore_impl.cpp

@ -11,25 +11,19 @@ blockstore_impl_t::blockstore_impl_t(blockstore_config_t & config, ring_loop_t *
ring_consumer.loop = [this]() { loop(); };
ringloop->register_consumer(&ring_consumer);
initialized = 0;
data_fd = meta_fd = journal.fd = -1;
parse_config(config);
zero_object = (uint8_t*)memalign_or_die(MEM_ALIGNMENT, block_size);
zero_object = (uint8_t*)memalign_or_die(MEM_ALIGNMENT, dsk.data_block_size);
try
{
open_data();
open_meta();
open_journal();
dsk.open_data();
dsk.open_meta();
dsk.open_journal();
calc_lengths();
data_alloc = new allocator(block_count);
data_alloc = new allocator(dsk.block_count);
}
catch (std::exception & e)
{
if (data_fd >= 0)
close(data_fd);
if (meta_fd >= 0 && meta_fd != data_fd)
close(meta_fd);
if (journal.fd >= 0 && journal.fd != meta_fd)
close(journal.fd);
dsk.close_all();
throw;
}
flusher = new journal_flusher_t(this);
@ -41,12 +35,7 @@ blockstore_impl_t::~blockstore_impl_t()
delete flusher;
free(zero_object);
ringloop->unregister_consumer(&ring_consumer);
if (data_fd >= 0)
close(data_fd);
if (meta_fd >= 0 && meta_fd != data_fd)
close(meta_fd);
if (journal.fd >= 0 && journal.fd != meta_fd)
close(journal.fd);
dsk.close_all();
if (metadata_buffer)
free(metadata_buffer);
if (clean_bitmap)
@ -343,9 +332,9 @@ void blockstore_impl_t::enqueue_op(blockstore_op_t *op)
{
if (op->opcode < BS_OP_MIN || op->opcode > BS_OP_MAX ||
((op->opcode == BS_OP_READ || op->opcode == BS_OP_WRITE || op->opcode == BS_OP_WRITE_STABLE) && (
op->offset >= block_size ||
op->len > block_size-op->offset ||
(op->len % disk_alignment)
op->offset >= dsk.data_block_size ||
op->len > dsk.data_block_size-op->offset ||
(op->len % dsk.disk_alignment)
)) ||
readonly && op->opcode != BS_OP_READ && op->opcode != BS_OP_LIST)
{
@ -477,7 +466,7 @@ void blockstore_impl_t::process_list(blockstore_op_t *op)
uint64_t min_inode = op->oid.inode;
uint64_t max_inode = op->version;
// Check PG
if (pg_count != 0 && (pg_stripe_size < MIN_BLOCK_SIZE || list_pg > pg_count))
if (pg_count != 0 && (pg_stripe_size < MIN_DATA_BLOCK_SIZE || list_pg > pg_count))
{
op->retval = -EINVAL;
FINISH_OP(op);

37
src/blockstore_impl.h

@ -4,6 +4,7 @@
#pragma once
#include "blockstore.h"
#include "blockstore_disk.h"
#include <sys/types.h>
#include <sys/ioctl.h>
@ -17,6 +18,7 @@
#include <list>
#include <deque>
#include <new>
#include <unordered_map>
#include "cpp-btree/btree_map.h"
@ -217,23 +219,10 @@ struct pool_shard_settings_t
class blockstore_impl_t
{
blockstore_disk_t dsk;
/******* OPTIONS *******/
std::string data_device, meta_device, journal_device;
uint32_t block_size;
uint64_t meta_offset;
uint64_t data_offset;
uint64_t cfg_journal_size, cfg_data_size;
// Required write alignment and journal/metadata/data areas' location alignment
uint32_t disk_alignment = 4096;
// Journal block size - minimum_io_size of the journal device is the best choice
uint64_t journal_block_size = 4096;
// Metadata block size - minimum_io_size of the metadata device is the best choice
uint64_t meta_block_size = 4096;
// Sparse write tracking granularity. 4 KB is a good choice. Must be a multiple of disk_alignment
uint64_t bitmap_granularity = 4096;
bool readonly = false;
// By default, Blockstore locks all opened devices exclusively. This option can be used to disable locking
bool disable_flock = false;
// It is safe to disable fsync() if drive write cache is writethrough
bool disable_data_fsync = false, disable_meta_fsync = false, disable_journal_fsync = false;
// Enable if you want every operation to be executed with an "implicit fsync"
@ -268,16 +257,6 @@ class blockstore_impl_t
allocator *data_alloc = NULL;
uint8_t *zero_object;
uint32_t block_order;
uint64_t block_count;
uint32_t clean_entry_bitmap_size = 0, clean_entry_size = 0;
int meta_fd;
int data_fd;
uint64_t meta_size, meta_area, meta_len;
uint64_t data_size, data_len;
uint64_t data_device_sect, meta_device_sect, journal_device_sect;
void *metadata_buffer = NULL;
struct journal_t journal;
@ -394,9 +373,9 @@ public:
// Print diagnostics to stdout
void dump_diagnostics();
inline uint32_t get_block_size() { return block_size; }
inline uint64_t get_block_count() { return block_count; }
inline uint32_t get_block_size() { return dsk.data_block_size; }
inline uint64_t get_block_count() { return dsk.block_count; }
inline uint64_t get_free_block_count() { return data_alloc->get_free_count(); }
inline uint32_t get_bitmap_granularity() { return disk_alignment; }
inline uint64_t get_journal_size() { return journal.len; }
inline uint32_t get_bitmap_granularity() { return dsk.disk_alignment; }
inline uint64_t get_journal_size() { return dsk.journal_len; }
};

101
src/blockstore_init.cpp

@ -57,9 +57,9 @@ int blockstore_init_meta::loop()
throw std::runtime_error("Failed to allocate metadata read buffer");
// Read superblock
GET_SQE();
data->iov = { metadata_buffer, bs->meta_block_size };
data->iov = { metadata_buffer, bs->dsk.meta_block_size };
data->callback = [this](ring_data_t *data) { handle_event(data); };
my_uring_prep_readv(sqe, bs->meta_fd, &data->iov, 1, bs->meta_offset);
my_uring_prep_readv(sqe, bs->dsk.meta_fd, &data->iov, 1, bs->dsk.meta_offset);
bs->ringloop->submit();
submitted = 1;
resume_1:
@ -68,16 +68,16 @@ resume_1:
wait_state = 1;
return 1;
}
if (iszero((uint64_t*)metadata_buffer, bs->meta_block_size / sizeof(uint64_t)))
if (iszero((uint64_t*)metadata_buffer, bs->dsk.meta_block_size / sizeof(uint64_t)))
{
{
blockstore_meta_header_v1_t *hdr = (blockstore_meta_header_v1_t *)metadata_buffer;
hdr->zero = 0;
hdr->magic = BLOCKSTORE_META_MAGIC_V1;
hdr->version = BLOCKSTORE_META_VERSION_V1;
hdr->meta_block_size = bs->meta_block_size;
hdr->data_block_size = bs->block_size;
hdr->bitmap_granularity = bs->bitmap_granularity;
hdr->meta_block_size = bs->dsk.meta_block_size;
hdr->data_block_size = bs->dsk.data_block_size;
hdr->bitmap_granularity = bs->dsk.bitmap_granularity;
}
if (bs->readonly)
{
@ -87,9 +87,9 @@ resume_1:
{
printf("Initializing metadata area\n");
GET_SQE();
data->iov = (struct iovec){ metadata_buffer, bs->meta_block_size };
data->iov = (struct iovec){ metadata_buffer, bs->dsk.meta_block_size };
data->callback = [this](ring_data_t *data) { handle_event(data); };
my_uring_prep_writev(sqe, bs->meta_fd, &data->iov, 1, bs->meta_offset);
my_uring_prep_writev(sqe, bs->dsk.meta_fd, &data->iov, 1, bs->dsk.meta_offset);
bs->ringloop->submit();
submitted = 1;
resume_3:
@ -115,27 +115,26 @@ resume_1:
);
exit(1);
}
if (hdr->meta_block_size != bs->meta_block_size ||
hdr->data_block_size != bs->block_size ||
hdr->bitmap_granularity != bs->bitmap_granularity)
if (hdr->meta_block_size != bs->dsk.meta_block_size ||
hdr->data_block_size != bs->dsk.data_block_size ||
hdr->bitmap_granularity != bs->dsk.bitmap_granularity)
{
printf(
"Configuration stored in metadata superblock"
" (meta_block_size=%u, data_block_size=%u, bitmap_granularity=%u)"
" differs from OSD configuration (%lu/%u/%lu).\n",
hdr->meta_block_size, hdr->data_block_size, hdr->bitmap_granularity,
bs->meta_block_size, bs->block_size, bs->bitmap_granularity
bs->dsk.meta_block_size, bs->dsk.data_block_size, bs->dsk.bitmap_granularity
);
exit(1);
}
}
// Skip superblock
bs->meta_offset += bs->meta_block_size;
bs->meta_len -= bs->meta_block_size;
md_offset = bs->dsk.meta_block_size;
metadata_read = bs->dsk.meta_block_size;
prev_done = 0;
done_len = 0;
done_pos = 0;
metadata_read = 0;
// Read the rest of the metadata
while (1)
{
@ -145,23 +144,23 @@ resume_1:
wait_state = 2;
return 1;
}
if (metadata_read < bs->meta_len)
if (metadata_read < bs->dsk.meta_len)
{
GET_SQE();
data->iov = {
(uint8_t*)metadata_buffer + (bs->inmemory_meta
? metadata_read
? metadata_read-md_offset
: (prev == 1 ? bs->metadata_buf_size : 0)),
bs->meta_len - metadata_read > bs->metadata_buf_size ? bs->metadata_buf_size : bs->meta_len - metadata_read,
bs->dsk.meta_len - metadata_read > bs->metadata_buf_size ? bs->metadata_buf_size : bs->dsk.meta_len - metadata_read,
};
data->callback = [this](ring_data_t *data) { handle_event(data); };
if (!zero_on_init)
my_uring_prep_readv(sqe, bs->meta_fd, &data->iov, 1, bs->meta_offset + metadata_read);
my_uring_prep_readv(sqe, bs->dsk.meta_fd, &data->iov, 1, bs->dsk.meta_offset + metadata_read);
else
{
// Fill metadata with zeroes
memset(data->iov.iov_base, 0, data->iov.iov_len);
my_uring_prep_writev(sqe, bs->meta_fd, &data->iov, 1, bs->meta_offset + metadata_read);
my_uring_prep_writev(sqe, bs->dsk.meta_fd, &data->iov, 1, bs->dsk.meta_offset + metadata_read);
}
bs->ringloop->submit();
submitted = (prev == 1 ? 2 : 1);
@ -170,13 +169,13 @@ resume_1:
if (prev_done)
{
void *done_buf = bs->inmemory_meta
? ((uint8_t*)metadata_buffer + done_pos)
? ((uint8_t*)metadata_buffer + done_pos-md_offset)
: ((uint8_t*)metadata_buffer + (prev_done == 2 ? bs->metadata_buf_size : 0));
unsigned count = bs->meta_block_size / bs->clean_entry_size;
for (int sector = 0; sector < done_len; sector += bs->meta_block_size)
unsigned count = bs->dsk.meta_block_size / bs->dsk.clean_entry_size;
for (int sector = 0; sector < done_len; sector += bs->dsk.meta_block_size)
{
// handle <count> entries
handle_entries((uint8_t*)done_buf + sector, count, bs->block_order);
handle_entries((uint8_t*)done_buf + sector, count, bs->dsk.block_order);
done_cnt += count;
}
prev_done = 0;
@ -188,7 +187,7 @@ resume_1:
}
}
// metadata read finished
printf("Metadata entries loaded: %lu, free blocks: %lu / %lu\n", entries_loaded, bs->data_alloc->get_free_count(), bs->block_count);
printf("Metadata entries loaded: %lu, free blocks: %lu / %lu\n", entries_loaded, bs->data_alloc->get_free_count(), bs->dsk.block_count);
if (!bs->inmemory_meta)
{
free(metadata_buffer);
@ -197,7 +196,7 @@ resume_1:
if (zero_on_init && !bs->disable_meta_fsync)
{
GET_SQE();
my_uring_prep_fsync(sqe, bs->meta_fd, IORING_FSYNC_DATASYNC);
my_uring_prep_fsync(sqe, bs->dsk.meta_fd, IORING_FSYNC_DATASYNC);
data->iov = { 0 };
data->callback = [this](ring_data_t *data) { handle_event(data); };
submitted = 1;
@ -216,10 +215,10 @@ void blockstore_init_meta::handle_entries(void* entries, unsigned count, int blo
{
for (unsigned i = 0; i < count; i++)
{
clean_disk_entry *entry = (clean_disk_entry*)((uint8_t*)entries + i*bs->clean_entry_size);
if (!bs->inmemory_meta && bs->clean_entry_bitmap_size)
clean_disk_entry *entry = (clean_disk_entry*)((uint8_t*)entries + i*bs->dsk.clean_entry_size);
if (!bs->inmemory_meta && bs->dsk.clean_entry_bitmap_size)
{
memcpy(bs->clean_bitmap + (done_cnt+i)*2*bs->clean_entry_bitmap_size, &entry->bitmap, 2*bs->clean_entry_bitmap_size);
memcpy(bs->clean_bitmap + (done_cnt+i)*2*bs->dsk.clean_entry_bitmap_size, &entry->bitmap, 2*bs->dsk.clean_entry_bitmap_size);
}
if (entry->oid.inode > 0)
{
@ -240,7 +239,7 @@ void blockstore_init_meta::handle_entries(void* entries, unsigned count, int blo
}
else
{
bs->inode_space_stats[entry->oid.inode] += bs->block_size;
bs->inode_space_stats[entry->oid.inode] += bs->dsk.data_block_size;
}
entries_loaded++;
#ifdef BLOCKSTORE_DEBUG
@ -328,7 +327,7 @@ int blockstore_init_journal::loop()
data = ((ring_data_t*)sqe->user_data);
data->iov = { submitted_buf, bs->journal.block_size };
data->callback = simple_callback;
my_uring_prep_readv(sqe, bs->journal.fd, &data->iov, 1, bs->journal.offset);
my_uring_prep_readv(sqe, bs->dsk.journal_fd, &data->iov, 1, bs->journal.offset);
bs->ringloop->submit();
wait_count = 1;
resume_1:
@ -367,7 +366,7 @@ resume_1:
GET_SQE();
data->iov = (struct iovec){ submitted_buf, 2*bs->journal.block_size };
data->callback = simple_callback;
my_uring_prep_writev(sqe, bs->journal.fd, &data->iov, 1, bs->journal.offset);
my_uring_prep_writev(sqe, bs->dsk.journal_fd, &data->iov, 1, bs->journal.offset);
wait_count++;
bs->ringloop->submit();
resume_6:
@ -379,7 +378,7 @@ resume_1:
if (!bs->disable_journal_fsync)
{
GET_SQE();
my_uring_prep_fsync(sqe, bs->journal.fd, IORING_FSYNC_DATASYNC);
my_uring_prep_fsync(sqe, bs->dsk.journal_fd, IORING_FSYNC_DATASYNC);
data->iov = { 0 };
data->callback = simple_callback;
wait_count++;
@ -448,7 +447,7 @@ resume_1:
end - journal_pos < JOURNAL_BUFFER_SIZE ? end - journal_pos : JOURNAL_BUFFER_SIZE,
};
data->callback = [this](ring_data_t *data1) { handle_event(data1); };
my_uring_prep_readv(sqe, bs->journal.fd, &data->iov, 1, bs->journal.offset + journal_pos);
my_uring_prep_readv(sqe, bs->dsk.journal_fd, &data->iov, 1, bs->journal.offset + journal_pos);
bs->ringloop->submit();
}
while (done.size() > 0)
@ -463,7 +462,7 @@ resume_1:
GET_SQE();
data->iov = { init_write_buf, bs->journal.block_size };
data->callback = simple_callback;
my_uring_prep_writev(sqe, bs->journal.fd, &data->iov, 1, bs->journal.offset + init_write_sector);
my_uring_prep_writev(sqe, bs->dsk.journal_fd, &data->iov, 1, bs->journal.offset + init_write_sector);
wait_count++;
bs->ringloop->submit();
resume_7:
@ -477,7 +476,7 @@ resume_1:
GET_SQE();
data->iov = { 0 };
data->callback = simple_callback;
my_uring_prep_fsync(sqe, bs->journal.fd, IORING_FSYNC_DATASYNC);
my_uring_prep_fsync(sqe, bs->dsk.journal_fd, IORING_FSYNC_DATASYNC);
wait_count++;
bs->ringloop->submit();
}
@ -544,7 +543,7 @@ resume_1:
? bs->journal.len-bs->journal.block_size - (bs->journal.next_free-bs->journal.used_start)
: bs->journal.used_start - bs->journal.next_free),
bs->journal.used_start, bs->journal.next_free,
bs->data_alloc->get_free_count(), bs->block_count
bs->data_alloc->get_free_count(), bs->dsk.block_count
);
bs->journal.crc32_last = crc32_last;
return 0;
@ -669,9 +668,9 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
};
void *bmp = NULL;
void *bmp_from = (uint8_t*)je + sizeof(journal_entry_small_write);
if (bs->clean_entry_bitmap_size <= sizeof(void*))
if (bs->dsk.clean_entry_bitmap_size <= sizeof(void*))
{
memcpy(&bmp, bmp_from, bs->clean_entry_bitmap_size);
memcpy(&bmp, bmp_from, bs->dsk.clean_entry_bitmap_size);
}
else
{
@ -679,8 +678,8 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
// allocations for entry bitmaps. This can only be fixed by using
// a patched map with dynamic entry size, but not the btree_map,
// because it doesn't keep iterators valid all the time.
bmp = malloc_or_die(bs->clean_entry_bitmap_size);
memcpy(bmp, bmp_from, bs->clean_entry_bitmap_size);
bmp = malloc_or_die(bs->dsk.clean_entry_bitmap_size);
memcpy(bmp, bmp_from, bs->dsk.clean_entry_bitmap_size);
}
bs->dirty_db.emplace(ov, (dirty_entry){
.state = (BS_ST_SMALL_WRITE | BS_ST_SYNCED),
@ -712,7 +711,7 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
printf(
"je_big_write%s oid=%lx:%lx ver=%lu loc=%lu\n",
je->type == JE_BIG_WRITE_INSTANT ? "_instant" : "",
je->big_write.oid.inode, je->big_write.oid.stripe, je->big_write.version, je->big_write.location >> bs->block_order
je->big_write.oid.inode, je->big_write.oid.stripe, je->big_write.version, je->big_write.location >> bs->dsk.block_order
);
#endif
auto dirty_it = bs->dirty_db.upper_bound((obj_ver_id){
@ -750,9 +749,9 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
};
void *bmp = NULL;
void *bmp_from = (uint8_t*)je + sizeof(journal_entry_big_write);
if (bs->clean_entry_bitmap_size <= sizeof(void*))
if (bs->dsk.clean_entry_bitmap_size <= sizeof(void*))
{
memcpy(&bmp, bmp_from, bs->clean_entry_bitmap_size);
memcpy(&bmp, bmp_from, bs->dsk.clean_entry_bitmap_size);
}
else
{
@ -760,8 +759,8 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
// allocations for entry bitmaps. This can only be fixed by using
// a patched map with dynamic entry size, but not the btree_map,
// because it doesn't keep iterators valid all the time.
bmp = malloc_or_die(bs->clean_entry_bitmap_size);
memcpy(bmp, bmp_from, bs->clean_entry_bitmap_size);
bmp = malloc_or_die(bs->dsk.clean_entry_bitmap_size);
memcpy(bmp, bmp_from, bs->dsk.clean_entry_bitmap_size);
}
auto dirty_it = bs->dirty_db.emplace(ov, (dirty_entry){
.state = (BS_ST_BIG_WRITE | BS_ST_SYNCED),
@ -772,7 +771,7 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
.journal_sector = proc_pos,
.bitmap = bmp,
}).first;
if (bs->data_alloc->get(je->big_write.location >> bs->block_order))
if (bs->data_alloc->get(je->big_write.location >> bs->dsk.block_order))
{
// This is probably a big_write that's already flushed and freed, but it may
// also indicate a bug. So we remember such entries and recheck them afterwards.
@ -785,11 +784,11 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
#ifdef BLOCKSTORE_DEBUG
printf(
"Allocate block (journal) %lu: %lx:%lx v%lu\n",
je->big_write.location >> bs->block_order,
je->big_write.location >> bs->dsk.block_order,
ov.oid.inode, ov.oid.stripe, ov.version
);
#endif
bs->data_alloc->set(je->big_write.location >> bs->block_order, true);
bs->data_alloc->set(je->big_write.location >> bs->dsk.block_order, true);
}
bs->journal.used_sectors[proc_pos]++;
#ifdef BLOCKSTORE_DEBUG
@ -913,8 +912,8 @@ void blockstore_init_journal::erase_dirty_object(blockstore_dirty_db_t::iterator
if (exists && clean_loc == UINT64_MAX)
{
auto & sp = bs->inode_space_stats[oid.inode];
if (sp > bs->block_size)
sp -= bs->block_size;
if (sp > bs->dsk.data_block_size)
sp -= bs->dsk.data_block_size;
else
bs->inode_space_stats.erase(oid.inode);
}

2
src/blockstore_init.h

@ -9,7 +9,7 @@ class blockstore_init_meta
int wait_state = 0;
bool zero_on_init = false;
void *metadata_buffer = NULL;
uint64_t metadata_read = 0;
uint64_t metadata_read = 0, md_offset = 0;
int prev = 0, prev_done = 0, done_len = 0, submitted = 0;
uint64_t done_cnt = 0, done_pos = 0;
uint64_t entries_loaded = 0;

2
src/blockstore_journal.cpp

@ -175,7 +175,7 @@ void blockstore_impl_t::prepare_journal_sector_write(int cur_sector, blockstore_
};
data->callback = [this, flush_id = journal.submit_id](ring_data_t *data) { handle_journal_write(data, flush_id); };
my_uring_prep_writev(
sqe, journal.fd, &data->iov, 1, journal.offset + journal.sector_info[cur_sector].offset
sqe, dsk.journal_fd, &data->iov, 1, journal.offset + journal.sector_info[cur_sector].offset
);
}
journal.sector_info[cur_sector].dirty = false;

1
src/blockstore_journal.h

@ -164,7 +164,6 @@ inline bool operator < (const pending_journaling_t & a, const pending_journaling
struct journal_t
{
int fd;
uint64_t device_size;
bool inmemory = false;
bool flush_journal = false;
void *buffer = NULL;

322
src/blockstore_open.cpp

@ -4,23 +4,10 @@
#include <sys/file.h>
#include "blockstore_impl.h"
static uint32_t is_power_of_two(uint64_t value)
{
uint32_t l = 0;
while (value > 1)
{
if (value & 1)
{
return 64;
}
value = value >> 1;
l++;
}
return l;
}