Split disk_tool.cpp into separate files

Vitaliy Filippov 2022-08-14 02:36:57 +03:00
parent 1170319431
commit b1e39b5dea
9 changed files with 2257 additions and 2213 deletions

View File

@ -194,7 +194,9 @@ configure_file(vitastor.pc.in vitastor.pc @ONLY)
# vitastor-disk
add_executable(vitastor-disk
disk_tool.cpp disk_simple_offsets.cpp crc32c.c str_util.cpp ../json11/json11.cpp rw_blocking.cpp allocator.cpp ringloop.cpp blockstore_disk.cpp
disk_tool.cpp disk_simple_offsets.cpp
disk_tool_journal.cpp disk_tool_meta.cpp disk_tool_prepare.cpp disk_tool_resize.cpp disk_tool_udev.cpp disk_tool_utils.cpp
crc32c.c str_util.cpp ../json11/json11.cpp rw_blocking.cpp allocator.cpp ringloop.cpp blockstore_disk.cpp
)
target_link_libraries(vitastor-disk
tcmalloc_minimal

File diff suppressed because it is too large Load Diff

136
src/disk_tool.h Normal file
View File

@ -0,0 +1,136 @@
// Copyright (c) Vitaliy Filippov, 2019+
// License: VNPL-1.1 (see README.md for details)
#pragma once
#ifndef _LARGEFILE64_SOURCE
#define _LARGEFILE64_SOURCE 1
#endif
#include <map>
#include <vector>
#include <string>
#include <functional>
#include "json11/json11.hpp"
#include "blockstore_disk.h"
#include "blockstore_impl.h"
#include "crc32c.h"
// Superblock magic: the ASCII string "vITADisk" read as a little-endian 64-bit integer
#define VITASTOR_DISK_MAGIC 0x6b73694441544976
// Presumably the upper bound for the on-disk superblock size, in bytes.
// Parenthesized so the macro expands correctly inside larger expressions (e.g. `x % VITASTOR_DISK_MAX_SB_SIZE`)
#define VITASTOR_DISK_MAX_SB_SIZE (128*1024)
// GPT partition type GUID that marks Vitastor OSD partitions
#define VITASTOR_PART_TYPE "e7009fac-a5a1-4d72-af72-53de13059903"
// Journal size used for HDD/hybrid OSDs when no explicit journal_size is given
#define DEFAULT_HYBRID_JOURNAL "1G"
struct resizer_data_moving_t;
// Description of one whole-disk device considered for new OSD partitions.
// Filled by disk_tool_t::collect_devices() and updated by disk_tool_t::add_partitions().
// Kept as a plain aggregate: it is built with a designated-initializer list.
struct vitastor_dev_info_t
{
// Device path as given by the caller (collect_devices() only accepts paths starting with /dev/)
std::string path;
// true when /sys/block/<dev>/queue/rotational reads "1"
bool is_hdd;
json11::Json pt; // pt = partition table
// Number of partitions whose type equals VITASTOR_PART_TYPE
int osd_part_count;
// Device size as reported for the device node — presumably bytes
uint64_t size;
// Free space: free_from_parttable(pt) when a partition table exists, otherwise the whole device size
uint64_t free;
};
// State and command implementations of the vitastor-disk tool.
// NOTE(review): most members have no in-class initializer; presumably the owner
// value-initializes the object — confirm at the construction site.
struct disk_tool_t
{
/**** Parameters ****/
// Command options by name (e.g. "data_device", "hybrid", "osd_per_disk", "journal_size")
std::map<std::string, std::string> options;
// all: dump_journal() scans every journal block instead of following the entry chain,
// and process_journal_block() no longer stops at entries with a bad checksum
// json: dump_journal() prints JSON instead of plain text
// now: not referenced in the visible part of the sources
bool all, json, now;
// Device names, offsets, lengths, block sizes and open descriptors of data, metadata and journal
blockstore_disk_t dsk;
// resize data and/or move metadata and journal
int iodepth;
std::string new_meta_device, new_journal_device;
uint64_t new_data_offset, new_data_len;
uint64_t new_journal_offset, new_journal_len;
uint64_t new_meta_offset, new_meta_len;
/**** State ****/
// Byte position inside the metadata area while process_meta() reads it
uint64_t meta_pos;
// journal_pos: position inside the journal area, advanced by process_journal_block()
// journal_calc_data_pos: position where the data of the current small write is expected
uint64_t journal_pos, journal_calc_data_pos;
// "Nothing printed yet" flags used to place commas in JSON output
// (first: outer list, first2: entries of the current journal block)
bool first, first2;
// Block allocator created by resize_data(); blocks referenced by metadata/journal are marked used
allocator *data_alloc;
// The members below down to moving_blocks are used by the resize_* methods
// (their implementation is outside the visible part of the sources)
std::map<uint64_t, uint64_t> data_remap;
std::map<uint64_t, uint64_t>::iterator remap_it;
ring_loop_t *ringloop;
ring_consumer_t ring_consumer;
int remap_active;
uint8_t *new_journal_buf, *new_meta_buf, *new_journal_ptr, *new_journal_data;
uint64_t new_journal_in_pos;
int64_t data_idx_diff;
uint64_t total_blocks, free_first, free_last;
uint64_t new_clean_entry_bitmap_size, new_clean_entry_size, new_entries_per_block;
int new_journal_fd, new_meta_fd;
resizer_data_moving_t *moving_blocks;
// Set once process_journal_block() has accepted the first entry of the chain
bool started;
// Data of the small write being processed; valid only during the iter_fn callback
void *small_write_data;
// crc32c of small_write_data
uint32_t data_crc32;
// crc32 of the last accepted journal entry, compared with crc32_prev of the next one
uint32_t crc32_last;
uint32_t new_crc32_prev;
/**** Commands ****/
int dump_journal();
int dump_meta();
int resize_data();
/**** Methods ****/
~disk_tool_t();
// Print one journal entry as text or JSON
void dump_journal_entry(int num, journal_entry *je, bool json);
// Read the journal block by block and pass each block to block_fn; returns 0 on success
int process_journal(std::function<int(void*)> block_fn);
// Iterate over the entries of one journal block; returns the number of entries processed
int process_journal_block(void *buf, std::function<void(int, journal_entry*)> iter_fn);
// Read the metadata area; hdr_fn gets the header (NULL for the legacy format),
// record_fn gets every entry with a non-zero inode
int process_meta(std::function<void(blockstore_meta_header_v1_t *)> hdr_fn,
std::function<void(uint64_t, clean_disk_entry*, uint8_t*)> record_fn);
void dump_meta_header(blockstore_meta_header_v1_t *hdr);
void dump_meta_entry(uint64_t block_num, clean_disk_entry *entry, uint8_t *bitmap);
// Steps of resize_data(), called in this order by it (as far as visible)
int resize_parse_params();
void resize_init(blockstore_meta_header_v1_t *hdr);
int resize_remap_blocks();
int resize_copy_data();
int resize_rewrite_journal();
int resize_write_new_journal();
int resize_rewrite_meta();
int resize_write_new_meta();
int udev_import(std::string device);
int read_sb(std::string device);
int write_sb(std::string device);
int exec_osd(std::string device);
int systemd_start_stop_osds(std::vector<std::string> cmd, std::vector<std::string> devices);
int pre_exec_osd(std::string device);
// Callers treat a null result as "no superblock found"
json11::Json read_osd_superblock(std::string device, bool expect_exist = true);
// Callers treat a zero result as failure
uint32_t write_osd_superblock(std::string device, json11::Json params);
// Create one OSD from explicit device options; is_hdd: 1 = HDD, 0 = SSD, -1 = detect
int prepare_one(std::map<std::string, std::string> options, int is_hdd = -1);
// Create OSDs on the listed whole disks (or on options["data_device"])
int prepare(std::vector<std::string> devices);
std::vector<vitastor_dev_info_t> collect_devices(const std::vector<std::string> & devices);
// Returns the array of created partitions, or null on error
json11::Json add_partitions(vitastor_dev_info_t & devinfo, std::vector<std::string> sizes);
// Returns UUIDs of the partitions to create OSDs on
std::vector<std::string> get_new_data_parts(vitastor_dev_info_t & dev, uint64_t osd_per_disk, uint64_t max_other_percent);
// Creates journal and metadata partitions on one of the SSDs and stores their paths in options
int get_meta_partition(std::vector<vitastor_dev_info_t> & ssds, std::map<std::string, std::string> & options);
};
// Free helper functions shared by the disk_tool_*.cpp files.
// The notes below describe how the visible callers use them; the implementations are elsewhere.
void disk_tool_simple_offsets(json11::Json cfg, bool json_output);
// Callers treat an empty result as failure
std::string realpath_str(std::string path, bool nofail = true);
std::string read_all_fd(int fd);
// Returns the file content; used to read sysfs attributes
std::string read_file(std::string file);
int check_queue_cache(std::string dev, std::string parent_dev);
// Callers treat an empty result as failure and a result equal to `dev` as "dev is not a partition"
std::string get_parent_device(std::string dev);
bool json_is_true(const json11::Json & val);
// Runs `cmd`, passing `in` as its input; callers treat 0 as success and may pass NULL for out/err
int shell_exec(const std::vector<std::string> & cmd, const std::string & in, std::string *out, std::string *err);
// Callers treat a non-zero result as failure
int write_zero(int fd, uint64_t offset, uint64_t size);
// Callers treat boolean false as a read error and null as "no partition table"
json11::Json read_parttable(std::string dev);
// Result is used as the amount of free space on the disk
uint64_t free_from_parttable(json11::Json pt);

284
src/disk_tool_journal.cpp Normal file
View File

@ -0,0 +1,284 @@
// Copyright (c) Vitaliy Filippov, 2019+
// License: VNPL-1.1 (see README.md for details)
#include "disk_tool.h"
// Print the journal contents to stdout, as plain text or (with `json`) as a JSON array.
// With `all` set every journal block is read in order and reported as zeroes, as a block
// of entries, or as unrecognized data; otherwise only the blocks visited by
// process_journal() are printed.
// Returns 0 on success and 1 on invalid block size, open/read failure or an invalid journal.
// In JSON mode the array is always terminated, even when an error interrupts the dump.
int disk_tool_t::dump_journal()
{
    if (dsk.journal_block_size < DIRECT_IO_ALIGNMENT || (dsk.journal_block_size % DIRECT_IO_ALIGNMENT) ||
        dsk.journal_block_size > 128*1024)
    {
        fprintf(stderr, "Invalid journal block size\n");
        return 1;
    }
    int result = 0;
    first = true;
    if (json)
        printf("[\n");
    if (all)
    {
        dsk.journal_fd = open(dsk.journal_device.c_str(), O_DIRECT|O_RDONLY);
        if (dsk.journal_fd < 0)
        {
            fprintf(stderr, "Failed to open journal device %s: %s\n", dsk.journal_device.c_str(), strerror(errno));
            result = 1;
        }
        else
        {
            void *journal_buf = memalign_or_die(MEM_ALIGNMENT, dsk.journal_block_size);
            journal_pos = 0;
            while (journal_pos < dsk.journal_len)
            {
                // Check the read explicitly: an assert() would vanish in NDEBUG builds
                ssize_t r = pread(dsk.journal_fd, journal_buf, dsk.journal_block_size, dsk.journal_offset+journal_pos);
                if (r != (ssize_t)dsk.journal_block_size)
                {
                    fprintf(
                        stderr, "Failed to read journal block at offset %08lx: %s\n",
                        journal_pos, r < 0 ? strerror(errno) : "short read"
                    );
                    result = 1;
                    break;
                }
                // Find the first non-zero 8-byte word; s == block size means the block is all zeroes
                uint64_t s;
                for (s = 0; s < dsk.journal_block_size; s += 8)
                {
                    if (*((uint64_t*)((uint8_t*)journal_buf+s)) != 0)
                        break;
                }
                if (json)
                {
                    printf("%s{\"offset\":\"0x%lx\"", first ? "" : ",\n", journal_pos);
                    first = false;
                }
                if (s == dsk.journal_block_size)
                {
                    if (json)
                        printf(",\"type\":\"zero\"}");
                    else
                        printf("offset %08lx: zeroes\n", journal_pos);
                    journal_pos += dsk.journal_block_size;
                }
                else if (((journal_entry*)journal_buf)->magic == JOURNAL_MAGIC)
                {
                    if (!json)
                        printf("offset %08lx:\n", journal_pos);
                    else
                        printf(",\"entries\":[\n");
                    first2 = true;
                    // process_journal_block() advances journal_pos itself
                    process_journal_block(journal_buf, [this](int num, journal_entry *je) { dump_journal_entry(num, je, json); });
                    if (json)
                        printf(first2 ? "]}" : "\n]}");
                }
                else
                {
                    if (json)
                        printf(",\"type\":\"data\",\"pattern\":\"%08lx\"}", *((uint64_t*)journal_buf));
                    else
                        printf("offset %08lx: no magic in the beginning, looks like random data (pattern=%08lx)\n", journal_pos, *((uint64_t*)journal_buf));
                    journal_pos += dsk.journal_block_size;
                }
            }
            free(journal_buf);
            close(dsk.journal_fd);
            dsk.journal_fd = -1;
        }
    }
    else
    {
        int r = process_journal([this](void *data)
        {
            first2 = true;
            if (!json)
                printf("offset %08lx:\n", journal_pos);
            auto pos = journal_pos;
            int r = process_journal_block(data, [this, pos](int num, journal_entry *je)
            {
                // The JSON block object is opened lazily, only when the block has at least one entry
                if (json && first2)
                {
                    printf("%s{\"offset\":\"0x%lx\",\"entries\":[\n", first ? "" : ",\n", pos);
                    first = false;
                }
                dump_journal_entry(num, je, json);
            });
            if (json)
                printf(first2 ? "" : "\n]}");
            else if (r <= 0)
                printf("end of the journal\n");
            return r;
        });
        // process_journal() returns non-zero when the journal can't be opened/read or is invalid
        if (r != 0)
            result = 1;
    }
    if (json)
        printf(first ? "]\n" : "\n]\n");
    return result;
}
// Read the journal and feed it to block_fn one block at a time.
// The first block must hold a valid JE_START entry; it is passed to block_fn first (its
// result is ignored), then reading continues from the journal_start position stored in it,
// wrapping to the second block when the end of the journal area is reached.
// Iteration stops when block_fn returns a value <= 0.
// Returns 0 on success, 1 when the journal can't be opened or read or its first block is
// invalid, or the negative value returned by block_fn.
int disk_tool_t::process_journal(std::function<int(void*)> block_fn)
{
    dsk.journal_fd = open(dsk.journal_device.c_str(), O_DIRECT|O_RDONLY);
    if (dsk.journal_fd < 0)
    {
        fprintf(stderr, "Failed to open journal device %s: %s\n", dsk.journal_device.c_str(), strerror(errno));
        return 1;
    }
    void *data = memalign_or_die(MEM_ALIGNMENT, dsk.journal_block_size);
    journal_pos = 0;
    int r = 0;
    // Reads are checked explicitly: an assert() would vanish in NDEBUG builds
    ssize_t rd = pread(dsk.journal_fd, data, dsk.journal_block_size, dsk.journal_offset+journal_pos);
    journal_entry *je = (journal_entry*)(data);
    if (rd != (ssize_t)dsk.journal_block_size)
    {
        fprintf(
            stderr, "offset %08lx: failed to read journal block: %s\n",
            journal_pos, rd < 0 ? strerror(errno) : "short read"
        );
        r = 1;
    }
    else if (je->magic != JOURNAL_MAGIC || je->type != JE_START || je_crc32(je) != je->crc32)
    {
        fprintf(stderr, "offset %08lx: journal superblock is invalid\n", journal_pos);
        r = 1;
    }
    else
    {
        started = false;
        crc32_last = 0;
        block_fn(data);
        // The entry chain starts anew at journal_start, so forget the state left by the first block
        started = false;
        crc32_last = 0;
        journal_pos = je->start.journal_start;
        while (1)
        {
            if (journal_pos >= dsk.journal_len)
                journal_pos = dsk.journal_block_size;
            rd = pread(dsk.journal_fd, data, dsk.journal_block_size, dsk.journal_offset+journal_pos);
            if (rd != (ssize_t)dsk.journal_block_size)
            {
                fprintf(
                    stderr, "offset %08lx: failed to read journal block: %s\n",
                    journal_pos, rd < 0 ? strerror(errno) : "short read"
                );
                r = 1;
                break;
            }
            r = block_fn(data);
            if (r <= 0)
                break;
        }
    }
    close(dsk.journal_fd);
    dsk.journal_fd = -1;
    free(data);
    return r;
}
// Walk the entries of one journal block and call iter_fn(index, entry) for each of them.
// journal_pos is advanced past the block and past the data of every small write in it.
// During the callback for a small write, small_write_data holds its data read from the
// journal, data_crc32 its crc32c and journal_calc_data_pos the position where that data
// was expected.
// Iteration stops at the first entry with a bad magic, type or size; unless `all` is set it
// also stops at a broken crc32_prev chain or a bad entry checksum.
// Returns the number of entries processed.
int disk_tool_t::process_journal_block(void *buf, std::function<void(int, journal_entry*)> iter_fn)
{
    uint32_t pos = 0;
    journal_pos += dsk.journal_block_size;
    int entry = 0;
    bool wrapped = false;
    while (pos <= dsk.journal_block_size-JOURNAL_ENTRY_HEADER_SIZE)
    {
        journal_entry *je = (journal_entry*)((uint8_t*)buf + pos);
        // The size check is written as "size > remaining room" so that it can't underflow,
        // and sizes smaller than the header are rejected because a zero size would make
        // the loop spin on the same entry forever
        if (je->magic != JOURNAL_MAGIC || je->type < JE_MIN || je->type > JE_MAX ||
            (!all && started && je->crc32_prev != crc32_last) ||
            je->size < JOURNAL_ENTRY_HEADER_SIZE || je->size > dsk.journal_block_size-pos)
        {
            break;
        }
        bool crc32_valid = je_crc32(je) == je->crc32;
        if (!all && !crc32_valid)
        {
            break;
        }
        started = true;
        crc32_last = je->crc32;
        if (je->type == JE_SMALL_WRITE || je->type == JE_SMALL_WRITE_INSTANT)
        {
            journal_calc_data_pos = journal_pos;
            if (journal_pos + je->small_write.len > dsk.journal_len)
            {
                // data continues from the beginning of the journal
                journal_calc_data_pos = journal_pos = dsk.journal_block_size;
                wrapped = true;
            }
            journal_pos += je->small_write.len;
            if (journal_pos >= dsk.journal_len)
            {
                journal_pos = dsk.journal_block_size;
                wrapped = true;
            }
            small_write_data = memalign_or_die(MEM_ALIGNMENT, je->small_write.len);
            // The read must not live inside assert(): it would be compiled out with NDEBUG
            ssize_t got = pread(dsk.journal_fd, small_write_data, je->small_write.len, dsk.journal_offset+je->small_write.data_offset);
            if (got != (ssize_t)je->small_write.len)
            {
                fprintf(
                    stderr, "Failed to read %u bytes of small write data at journal offset %08lx\n",
                    je->small_write.len, je->small_write.data_offset
                );
                abort();
            }
            data_crc32 = crc32c(0, small_write_data, je->small_write.len);
        }
        iter_fn(entry, je);
        if (je->type == JE_SMALL_WRITE || je->type == JE_SMALL_WRITE_INSTANT)
        {
            free(small_write_data);
            small_write_data = NULL;
        }
        pos += je->size;
        entry++;
    }
    if (wrapped)
    {
        // Put the position at the end of the journal area after a wrap-around
        journal_pos = dsk.journal_len;
    }
    return entry;
}
// Print one journal entry to stdout.
// num  - index of the entry inside its block (printed in text mode only)
// je   - the entry
// json - print a JSON object instead of a text line; first2 is used to separate the
//        objects of one block with commas
// For small writes data_crc32 and journal_calc_data_pos must already describe this entry
// (process_journal_block() sets them before calling the iteration callback).
void disk_tool_t::dump_journal_entry(int num, journal_entry *je, bool json)
{
    if (json)
    {
        if (!first2)
            printf(",\n");
        first2 = false;
        printf(
            "{\"crc32\":\"%08x\",\"valid\":%s,\"crc32_prev\":\"%08x\"",
            je->crc32, (je_crc32(je) == je->crc32 ? "true" : "false"), je->crc32_prev
        );
    }
    else
    {
        printf(
            "entry % 3d: crc32=%08x %s prev=%08x ",
            num, je->crc32, (je_crc32(je) == je->crc32 ? "(valid)" : "(invalid)"), je->crc32_prev
        );
    }
    if (je->type == JE_START)
    {
        printf(
            json ? ",\"type\":\"start\",\"start\":\"0x%lx\"}" : "je_start start=%08lx\n",
            je->start.journal_start
        );
    }
    else if (je->type == JE_SMALL_WRITE || je->type == JE_SMALL_WRITE_INSTANT)
    {
        printf(
            json ? ",\"type\":\"small_write%s\",\"inode\":\"0x%lx\",\"stripe\":\"0x%lx\",\"ver\":\"%lu\",\"offset\":%u,\"len\":%u,\"loc\":\"0x%lx\""
                : "je_small_write%s oid=%lx:%lx ver=%lu offset=%u len=%u loc=%08lx",
            je->type == JE_SMALL_WRITE_INSTANT ? "_instant" : "",
            je->small_write.oid.inode, je->small_write.oid.stripe,
            je->small_write.version, je->small_write.offset, je->small_write.len,
            je->small_write.data_offset
        );
        if (journal_calc_data_pos != je->small_write.data_offset)
        {
            // Report the position that was actually compared; journal_pos has already
            // been advanced past the data of this write
            printf(json ? ",\"bad_loc\":true,\"calc_loc\":\"0x%lx\""
                : " (mismatched, calculated = %lu)", journal_calc_data_pos);
        }
        printf(
            json ? ",\"data_crc32\":\"%08x\",\"data_valid\":%s}" : " data_crc32=%08x%s\n",
            je->small_write.crc32_data,
            (data_crc32 != je->small_write.crc32_data
                ? (json ? "false" : " (invalid)")
                : (json ? "true" : " (valid)"))
        );
    }
    else if (je->type == JE_BIG_WRITE || je->type == JE_BIG_WRITE_INSTANT)
    {
        printf(
            json ? ",\"type\":\"big_write%s\",\"inode\":\"0x%lx\",\"stripe\":\"0x%lx\",\"ver\":\"%lu\",\"loc\":\"0x%lx\"}"
                : "je_big_write%s oid=%lx:%lx ver=%lu loc=%08lx\n",
            je->type == JE_BIG_WRITE_INSTANT ? "_instant" : "",
            je->big_write.oid.inode, je->big_write.oid.stripe, je->big_write.version, je->big_write.location
        );
    }
    else if (je->type == JE_STABLE)
    {
        printf(
            json ? ",\"type\":\"stable\",\"inode\":\"0x%lx\",\"stripe\":\"0x%lx\",\"ver\":\"%lu\"}"
                : "je_stable oid=%lx:%lx ver=%lu\n",
            je->stable.oid.inode, je->stable.oid.stripe, je->stable.version
        );
    }
    else if (je->type == JE_ROLLBACK)
    {
        printf(
            json ? ",\"type\":\"rollback\",\"inode\":\"0x%lx\",\"stripe\":\"0x%lx\",\"ver\":\"%lu\"}"
                : "je_rollback oid=%lx:%lx ver=%lu\n",
            je->rollback.oid.inode, je->rollback.oid.stripe, je->rollback.version
        );
    }
    else if (je->type == JE_DELETE)
    {
        printf(
            json ? ",\"type\":\"delete\",\"inode\":\"0x%lx\",\"stripe\":\"0x%lx\",\"ver\":\"%lu\"}"
                : "je_delete oid=%lx:%lx ver=%lu\n",
            je->del.oid.inode, je->del.oid.stripe, je->del.version
        );
    }
}

158
src/disk_tool_meta.cpp Normal file
View File

@ -0,0 +1,158 @@
// Copyright (c) Vitaliy Filippov, 2019+
// License: VNPL-1.1 (see README.md for details)
#include "disk_tool.h"
#include "rw_blocking.h"
#include "osd_id.h"
// Read the metadata area and report its contents through two callbacks.
// hdr_fn    - called once before any record: with a pointer to a copy of the header when a
//             v1 superblock is found, or with NULL for the legacy header-less format.
//             The pointer is only valid during the call.
// record_fn - called as (entry index, entry, bitmap) for every entry with a non-zero inode;
//             bitmap is NULL in the legacy format.
// dsk.meta_block_size, dsk.bitmap_granularity, dsk.clean_entry_bitmap_size and
// dsk.clean_entry_size are updated from what is found on disk.
// Returns 0 on success and 1 on error; all error returns happen before hdr_fn is called.
int disk_tool_t::process_meta(std::function<void(blockstore_meta_header_v1_t *)> hdr_fn,
    std::function<void(uint64_t, clean_disk_entry*, uint8_t*)> record_fn)
{
    if (dsk.meta_block_size % DIRECT_IO_ALIGNMENT)
    {
        fprintf(stderr, "Invalid metadata block size: is not a multiple of %d\n", DIRECT_IO_ALIGNMENT);
        return 1;
    }
    dsk.meta_fd = open(dsk.meta_device.c_str(), O_DIRECT|O_RDONLY);
    if (dsk.meta_fd < 0)
    {
        fprintf(stderr, "Failed to open metadata device %s: %s\n", dsk.meta_device.c_str(), strerror(errno));
        return 1;
    }
    int buf_size = 1024*1024;
    if (buf_size % dsk.meta_block_size)
        buf_size = 8*dsk.meta_block_size;
    if (buf_size > dsk.meta_len)
        buf_size = dsk.meta_len;
    void *data = memalign_or_die(MEM_ALIGNMENT, buf_size);
    // Common cleanup for the error paths below
    auto fail = [&]()
    {
        free(data);
        close(dsk.meta_fd);
        dsk.meta_fd = -1;
        return 1;
    };
    lseek64(dsk.meta_fd, dsk.meta_offset, 0);
    read_blocking(dsk.meta_fd, data, buf_size);
    // Check superblock
    blockstore_meta_header_v1_t *hdr = (blockstore_meta_header_v1_t *)data;
    if (hdr->zero == 0 &&
        hdr->magic == BLOCKSTORE_META_MAGIC_V1 &&
        hdr->version == BLOCKSTORE_META_VERSION_V1)
    {
        // Vitastor 0.6-0.7 - static array of clean_disk_entry with bitmaps
        // Keep a copy of the header: the buffer may be reallocated below and is overwritten
        // by the entry reads, so the original pointer must not be used after this point
        blockstore_meta_header_v1_t hdr_copy = *hdr;
        hdr = NULL;
        if (!hdr_copy.meta_block_size || !hdr_copy.bitmap_granularity)
        {
            // Both values are used as divisors below
            fprintf(stderr, "Invalid metadata superblock: zero meta_block_size or bitmap_granularity\n");
            return fail();
        }
        if (hdr_copy.meta_block_size != dsk.meta_block_size)
        {
            fprintf(stderr, "Using block size of %u bytes based on information from the superblock\n", hdr_copy.meta_block_size);
            dsk.meta_block_size = hdr_copy.meta_block_size;
            if (buf_size % dsk.meta_block_size)
            {
                buf_size = 8*dsk.meta_block_size;
                free(data);
                data = memalign_or_die(MEM_ALIGNMENT, buf_size);
            }
        }
        dsk.bitmap_granularity = hdr_copy.bitmap_granularity;
        dsk.clean_entry_bitmap_size = hdr_copy.data_block_size / hdr_copy.bitmap_granularity / 8;
        dsk.clean_entry_size = sizeof(clean_disk_entry) + 2*dsk.clean_entry_bitmap_size;
        if (dsk.clean_entry_size > dsk.meta_block_size)
        {
            // The loop bound below would underflow
            fprintf(stderr, "Invalid metadata superblock: entry size exceeds metadata block size\n");
            return fail();
        }
        uint64_t block_num = 0;
        hdr_fn(&hdr_copy);
        // Entries start in the block following the superblock
        meta_pos = dsk.meta_block_size;
        lseek64(dsk.meta_fd, dsk.meta_offset+meta_pos, 0);
        while (meta_pos < dsk.meta_len)
        {
            uint64_t read_len = buf_size < dsk.meta_len-meta_pos ? buf_size : dsk.meta_len-meta_pos;
            read_blocking(dsk.meta_fd, data, read_len);
            meta_pos += read_len;
            for (uint64_t blk = 0; blk < read_len; blk += dsk.meta_block_size)
            {
                for (uint64_t ioff = 0; ioff <= dsk.meta_block_size-dsk.clean_entry_size; ioff += dsk.clean_entry_size, block_num++)
                {
                    clean_disk_entry *entry = (clean_disk_entry*)((uint8_t*)data + blk + ioff);
                    if (entry->oid.inode)
                    {
                        record_fn(block_num, entry, entry->bitmap);
                    }
                }
            }
        }
    }
    else
    {
        // Vitastor 0.4-0.5 - static array of clean_disk_entry
        dsk.clean_entry_bitmap_size = 0;
        dsk.clean_entry_size = sizeof(clean_disk_entry);
        uint64_t block_num = 0;
        hdr_fn(NULL);
        // There is no header in this format: restart from the beginning of the metadata
        // area, otherwise the entries consumed by the superblock probe would be skipped
        meta_pos = 0;
        lseek64(dsk.meta_fd, dsk.meta_offset, 0);
        while (meta_pos < dsk.meta_len)
        {
            uint64_t read_len = buf_size < dsk.meta_len-meta_pos ? buf_size : dsk.meta_len-meta_pos;
            read_blocking(dsk.meta_fd, data, read_len);
            meta_pos += read_len;
            for (uint64_t blk = 0; blk < read_len; blk += dsk.meta_block_size)
            {
                // Same "<=" bound as in the v1 branch: the last entry may end exactly at the block end
                for (uint64_t ioff = 0; ioff <= dsk.meta_block_size-dsk.clean_entry_size; ioff += dsk.clean_entry_size, block_num++)
                {
                    clean_disk_entry *entry = (clean_disk_entry*)((uint8_t*)data + blk + ioff);
                    if (entry->oid.inode)
                    {
                        record_fn(block_num, entry, NULL);
                    }
                }
            }
        }
    }
    free(data);
    close(dsk.meta_fd);
    dsk.meta_fd = -1;
    return 0;
}
// Print the metadata area to stdout as a single JSON object.
// Returns the result of process_meta() (0 on success).
int disk_tool_t::dump_meta()
{
    int r = process_meta(
        [this](blockstore_meta_header_v1_t *hdr) { dump_meta_header(hdr); },
        [this](uint64_t block_num, clean_disk_entry *entry, uint8_t *bitmap) { dump_meta_entry(block_num, entry, bitmap); }
    );
    // process_meta() fails only before the header callback runs, i.e. before the object
    // has been opened, so the terminator must be printed on success only
    if (r == 0)
        printf("\n]}\n");
    return r;
}
// Open the JSON object printed by dump_meta(): format version and block sizes, followed
// by the opening bracket of the "entries" array. A NULL hdr means the legacy format
// without a superblock. Also resets the comma flag used by dump_meta_entry().
void disk_tool_t::dump_meta_header(blockstore_meta_header_v1_t *hdr)
{
    first = true;
    if (!hdr)
    {
        printf("{\"version\":\"0.5\",\"meta_block_size\":%lu,\"entries\":[\n", dsk.meta_block_size);
        return;
    }
    printf(
        "{\"version\":\"0.6\",\"meta_block_size\":%u,\"data_block_size\":%u,\"bitmap_granularity\":%u,\"entries\":[\n",
        hdr->meta_block_size, hdr->data_block_size, hdr->bitmap_granularity
    );
}
// Print one metadata entry as a JSON object, preceded by ",\n" unless it is the first one.
// block_num - index of the entry
// entry     - the entry itself
// bitmap    - NULL, or the entry's bitmap immediately followed by the external bitmap,
//             dsk.clean_entry_bitmap_size bytes each; both are printed as hex strings
void disk_tool_t::dump_meta_entry(uint64_t block_num, clean_disk_entry *entry, uint8_t *bitmap)
{
    // The separator is printed on its own instead of switching format strings with
    // #define/#undef inside the printf() argument list: preprocessor directives inside
    // the arguments of a function-like macro are undefined behaviour, and printf may be one
    if (!first)
        printf(",\n");
    printf(
        "{\"block\":%lu,\"pool\":%u,\"inode\":%lu,\"stripe\":%lu,\"version\":%lu",
        block_num, INODE_POOL(entry->oid.inode), INODE_NO_POOL(entry->oid.inode),
        entry->oid.stripe, entry->version
    );
    if (bitmap)
    {
        printf(",\"bitmap\":\"");
        for (uint64_t i = 0; i < dsk.clean_entry_bitmap_size; i++)
        {
            printf("%02x", bitmap[i]);
        }
        printf("\",\"ext_bitmap\":\"");
        for (uint64_t i = 0; i < dsk.clean_entry_bitmap_size; i++)
        {
            printf("%02x", bitmap[dsk.clean_entry_bitmap_size + i]);
        }
        printf("\"}");
    }
    else
    {
        printf("}");
    }
    first = false;
}

504
src/disk_tool_prepare.cpp Normal file
View File

@ -0,0 +1,504 @@
// Copyright (c) Vitaliy Filippov, 2019+
// License: VNPL-1.1 (see README.md for details)
#include "disk_tool.h"
#include "str_util.h"
#include "osd_id.h"
int disk_tool_t::prepare_one(std::map<std::string, std::string> options, int is_hdd)
{
static const char *allow_additional_params[] = {
"max_write_iodepth",
"max_write_iodepth",
"min_flusher_count",
"max_flusher_count",
"inmemory_metadata",
"inmemory_journal",
"journal_sector_buffer_count",
"journal_no_same_sector_overwrites",
"throttle_small_writes",
"throttle_target_iops",
"throttle_target_mbs",
"throttle_target_parallelism",
"throttle_threshold_us",
};
if (options.find("force") == options.end())
{
std::vector<std::string> all_devs = { options["data_device"], options["meta_device"], options["journal_device"] };
for (int i = 0; i < all_devs.size(); i++)
{
const auto & dev = all_devs[i];
if (dev == "")
continue;
std::string real_dev = realpath_str(dev, false);
if (real_dev == "")
return 1;
std::string parent_dev = get_parent_device(real_dev);
if (parent_dev == "")
return 1;
if (parent_dev == real_dev)
{
fprintf(stderr, "%s is not a partition, not creating OSD without --force\n", dev.c_str());
return 1;
}
if (i == 0 && is_hdd == -1)
is_hdd = read_file("/sys/block/"+parent_dev+"/queue/rotational") == "1";
std::string out;
if (shell_exec({ "/sbin/blkid", "-D", "-p", dev }, "", &out, NULL) == 0)
{
fprintf(stderr, "%s contains data, not creating OSD without --force. blkid -D -p says:\n%s", dev.c_str(), out.c_str());
return 1;
}
json11::Json sb = read_osd_superblock(dev, false);
if (!sb.is_null())
{
fprintf(stderr, "%s already contains Vitastor OSD superblock, not creating OSD without --force\n", dev.c_str());
return 1;
}
}
}
// Calculate offsets if the same device is used for two or more of data, meta, and journal
if (options["journal_size"] == "")
{
if (options["journal_device"] == "")
options["journal_size"] = "32M";
else if (is_hdd)
options["journal_size"] = DEFAULT_HYBRID_JOURNAL;
}
if (is_hdd)
{
if (options["block_size"] == "")
options["block_size"] = "1M";
if (options["throttle_small_writes"] == "")
options["throttle_small_writes"] = "1";
}
json11::Json::object sb;
blockstore_disk_t dsk;
try
{
dsk.parse_config(options);
dsk.open_data();
dsk.open_meta();
dsk.open_journal();
dsk.calc_lengths(true);
sb = json11::Json::object {
{ "data_device", options["data_device"] },
{ "meta_device", options["meta_device"] },
{ "journal_device", options["journal_device"] },
{ "block_size", (uint64_t)dsk.data_block_size },
{ "meta_block_size", dsk.meta_block_size },
{ "journal_block_size", dsk.journal_block_size },
{ "data_size", dsk.cfg_data_size },
{ "disk_alignment", (uint64_t)dsk.disk_alignment },
{ "bitmap_granularity", dsk.bitmap_granularity },
{ "disable_device_lock", dsk.disable_flock },
{ "journal_offset", 4096 },
{ "meta_offset", 4096 + (dsk.meta_device == dsk.journal_device ? dsk.journal_len : 0) },
{ "data_offset", 4096 + (dsk.data_device == dsk.meta_device ? dsk.meta_len : 0) +
(dsk.data_device == dsk.journal_device ? dsk.journal_len : 0) },
{ "journal_no_same_sector_overwrites", true },
{ "journal_sector_buffer_count", 1024 },
{ "disable_data_fsync", json_is_true(options["disable_data_fsync"]) },
{ "disable_meta_fsync", json_is_true(options["disable_meta_fsync"]) },
{ "disable_journal_fsync", json_is_true(options["disable_journal_fsync"]) },
{ "immediate_commit", json_is_true(options["disable_data_fsync"])
? (json_is_true(options["disable_journal_fsync"]) ? "all" : "small") : "none" },
};
for (int i = 0; i < sizeof(allow_additional_params)/sizeof(allow_additional_params[0]); i++)
{
auto it = options.find(allow_additional_params[i]);
if (it != options.end())
{
sb[it->first] = it->second;
}
}
}
catch (std::exception & e)
{
dsk.close_all();
fprintf(stderr, "%s\n", e.what());
return 1;
}
std::string osd_num_str;
if (shell_exec({ "vitastor-cli", "alloc-osd" }, "", &osd_num_str, NULL) != 0)
{
dsk.close_all();
return 1;
}
osd_num_t osd_num = stoull_full(trim(osd_num_str), 10);
if (!osd_num)
{
dsk.close_all();
fprintf(stderr, "Could not create OSD. vitastor-cli alloc-osd didn't return a valid OSD number:\n%s", osd_num_str.c_str());
return 1;
}
sb["osd_num"] = osd_num;
// Zero out metadata and journal
if (write_zero(dsk.meta_fd, dsk.meta_offset, dsk.meta_len) != 0 ||
write_zero(dsk.journal_fd, dsk.journal_offset, dsk.journal_len) != 0)
{
fprintf(stderr, "Failed to zero out metadata or journal: %s\n", strerror(errno));
dsk.close_all();
return 1;
}
dsk.close_all();
// Write superblocks
if (!write_osd_superblock(options["data_device"], sb) ||
options["meta_device"] != "" &&
options["meta_device"] != options["data_device"] &&
write_osd_superblock(options["meta_device"], sb) ||
options["journal_device"] != "" &&
options["journal_device"] != options["data_device"] &&
options["journal_device"] != options["meta_device"] &&
!write_osd_superblock(options["journal_device"], sb))
{
return 1;
}
return 0;
}
std::vector<vitastor_dev_info_t> disk_tool_t::collect_devices(const std::vector<std::string> & devices)
{
std::vector<vitastor_dev_info_t> devinfo;
for (auto & dev: devices)
{
// Check if the device is a whole disk
if (dev.substr(0, 5) != "/dev/")
{
fprintf(stderr, "%s does not start with /dev/, ignoring\n", dev.c_str());
continue;
}
struct stat dev_st, sys_st;
if (stat(dev.c_str(), &dev_st) < 0)
{
if (errno == ENOENT)
{
fprintf(stderr, "%s does not exist, skipping\n", dev.c_str());
return {};
}
fprintf(stderr, "Error checking %s: %s\n", dev.c_str(), strerror(errno));
return {};
}
if (stat(("/sys/block/"+dev.substr(5)).c_str(), &sys_st) < 0)
{
if (errno == ENOENT)
{
fprintf(stderr, "%s is probably a partition (no entry in /sys/block/), ignoring\n", dev.c_str());
continue;
}
fprintf(stderr, "Error checking /sys/block/%s: %s\n", dev.c_str()+5, strerror(errno));
return {};
}
// Check if the device is an SSD
bool is_hdd = read_file("/sys/block/"+dev.substr(5)+"/queue/rotational") == "1";
// Check if it has a partition table
json11::Json pt = read_parttable(dev);
if (pt.is_bool() && !pt.bool_value())
{
// Error reading table
return {};
}
if (pt.is_null())
{
// No partition table
std::string out;
int r = shell_exec({ "/sbin/blkid", "-p", dev }, "", &out, NULL);
if (r == 0)
{
fprintf(stderr, "%s contains data, skipping:\n %s\n", dev.c_str(), str_replace(trim(out), "\n", "\n ").c_str());
continue;
}
}
int osds = 0;
for (const auto & p: pt["partitions"].array_items())
if (strtolower(p["type"].string_value()) == VITASTOR_PART_TYPE)
osds++;
devinfo.push_back((vitastor_dev_info_t){
.path = dev,
.is_hdd = is_hdd,
.pt = pt,
.osd_part_count = osds,
.size = (uint64_t)dev_st.st_size,
.free = !pt.is_null() ? free_from_parttable(pt) : dev_st.st_size,
});
}
if (!devinfo.size())
{
fprintf(stderr, "No suitable devices found\n");
}
return devinfo;
}
// Append new Vitastor OSD partitions to a disk with sfdisk.
// devinfo - the disk; its partition table, OSD partition count and free space are refreshed on success
// sizes   - one sfdisk size specification per new partition
// The existing partitions are written back unchanged in the same sfdisk script. After
// sfdisk finishes, the function waits up to about 30 seconds for the
// /dev/disk/by-partuuid/ symlinks of all partitions to appear.
// Returns the array of newly created partition objects, or null in case of an error.
json11::Json disk_tool_t::add_partitions(vitastor_dev_info_t & devinfo, std::vector<std::string> sizes)
{
    std::string script = "label: gpt\n\n";
    std::set<std::string> is_old;
    for (const auto & part: devinfo.pt["partitions"].array_items())
    {
        // Old partitions
        is_old.insert(part["uuid"].string_value());
        script += part["node"].string_value()+": ";
        int n = 0;
        for (const auto & kv: part.object_items())
        {
            if (kv.first != "node")
            {
                // The separator goes between fields: "a=1, b=2" (it used to be appended
                // after the second and later fields, giving "a=1b=2, ")
                script += (n++ ? ", " : "")+kv.first+"="+(kv.second.is_string() ? kv.second.string_value() : kv.second.dump());
            }
        }
        script += "\n";
    }
    for (const auto & size: sizes)
    {
        script += "+ "+size+" "+std::string(VITASTOR_PART_TYPE)+"\n";
    }
    if (shell_exec({ "/sbin/sfdisk", devinfo.path }, script, NULL, NULL) != 0)
    {
        fprintf(stderr, "Failed to add %lu partition(s) with sfdisk\n", sizes.size());
        return {};
    }
    // Get new partition table and find created partitions
    json11::Json newpt = read_parttable(devinfo.path);
    json11::Json::array new_parts;
    for (const auto & part: newpt["partitions"].array_items())
    {
        if (is_old.find(part["uuid"].string_value()) == is_old.end())
        {
            new_parts.push_back(part);
        }
    }
    if (new_parts.size() != sizes.size())
    {
        fprintf(stderr, "Failed to add %lu partition(s) with sfdisk: new partitions not found in table\n", sizes.size());
        return {};
    }
    // Wait until device symlinks in /dev/disk/by-partuuid/ appear
    bool exists = false;
    int iter = 0;
    while (!exists && iter < 300) // max 30 sec
    {
        exists = true;
        for (const auto & part: newpt["partitions"].array_items())
        {
            std::string link_path = "/dev/disk/by-partuuid/"+strtolower(part["uuid"].string_value());
            struct stat st;
            if (lstat(link_path.c_str(), &st) < 0)
            {
                if (errno == ENOENT)
                    exists = false;
                else
                {
                    fprintf(stderr, "Failed to lstat %s: %s\n", link_path.c_str(), strerror(errno));
                    return {};
                }
            }
        }
        if (!exists)
        {
            struct timespec ts = { .tv_sec = 0, .tv_nsec = 100000000 }; // 100ms
            // Only completed sleeps count towards the timeout
            iter += (nanosleep(&ts, NULL) == 0);
        }
    }
    if (!exists)
    {
        // Keep going as before, but don't hide the timeout from the user
        fprintf(stderr, "Warning: partition symlinks of %s did not appear in /dev/disk/by-partuuid/ in time\n", devinfo.path.c_str());
    }
    devinfo.pt = newpt;
    devinfo.osd_part_count += sizes.size();
    devinfo.free = free_from_parttable(newpt);
    return new_parts;
}
// Select the partitions of one disk to create OSDs on, creating partitions when needed.
// dev               - the disk; updated when partitions are added
// osd_per_disk      - desired number of OSDs on the disk
// max_other_percent - how much of the disk (in percent) may be taken by non-OSD data
//                     for new OSD partitions to still be created on it
// A disk without a partition table gets osd_per_disk new partitions. On a partitioned disk
// the Vitastor partitions without a superblock are reused, and the missing ones are
// created unless the disk already has enough initialized OSDs or holds too much other data.
// Returns the UUIDs of the partitions to use; the list may be empty.
std::vector<std::string> disk_tool_t::get_new_data_parts(vitastor_dev_info_t & dev,
    uint64_t osd_per_disk, uint64_t max_other_percent)
{
    std::vector<std::string> use_parts;
    uint64_t want_parts = 0;
    if (dev.pt.is_null())
    {
        want_parts = osd_per_disk;
    }
    else if (dev.pt["partitions"].array_items().size() > 0)
    {
        // Disk already has partitions. If these are empty Vitastor OSD partitions, we can use them
        uint64_t osds_exist = 0, osds_size = 0;
        for (const auto & part: dev.pt["partitions"].array_items())
        {
            if (strtolower(part["type"].string_value()) == VITASTOR_PART_TYPE)
            {
                // Check if an existing Vitastor partition is empty
                json11::Json sb = read_osd_superblock(part["node"].string_value(), false);
                if (sb.is_null())
                {
                    // Use this partition
                    use_parts.push_back(part["uuid"].string_value());
                }
                else
                {
                    fprintf(
                        stderr, "%s is already initialized for OSD %lu, skipping\n",
                        part["node"].string_value().c_str(), sb["params"]["osd_num"].uint64_value()
                    );
                    osds_exist++;
                    osds_size += part["size"].uint64_value()*dev.pt["sectorsize"].uint64_value();
                }
            }
        }
        // Still create OSD(s) if a disk has no more than (max_other_percent) other data
        if (osds_exist >= osd_per_disk || (dev.free+osds_size) < dev.size*(100-max_other_percent)/100)
            fprintf(stderr, "%s is already partitioned, skipping\n", dev.path.c_str());
        else
            want_parts = osd_per_disk-osds_exist;
    }
    if (want_parts > 0)
    {
        // 1 MiB is kept unallocated; without this check the subtraction below would wrap around
        if (dev.free <= 1048576)
        {
            fprintf(stderr, "%s does not have enough free space for new partitions, skipping\n", dev.path.c_str());
            return use_parts;
        }
        // Create the missing OSD partition(s): equally sized, the last one takes the remaining space
        std::vector<std::string> sizes;
        auto each_size = std::to_string((dev.free - 1048576) / 1048576 / want_parts)+"MiB";
        for (uint64_t i = 0; i < want_parts-1; i++)
            sizes.push_back(each_size);
        sizes.push_back("+");
        auto new_parts = add_partitions(dev, sizes);
        for (const auto & part: new_parts.array_items())
            use_parts.push_back(part["uuid"].string_value());
    }
    return use_parts;
}
int disk_tool_t::get_meta_partition(std::vector<vitastor_dev_info_t> & ssds, std::map<std::string, std::string> & options)
{
uint64_t journal_size = parse_size(options["journal_size"]);
journal_size = ((journal_size+1024*1024-1)/1024/1024)*1024*1024;
// Calculate metadata size
uint64_t meta_size = 0;
try
{
blockstore_disk_t dsk;
dsk.parse_config(options);
dsk.open_data();
dsk.open_meta();
dsk.open_journal();
dsk.calc_lengths(true);
dsk.close_all();
meta_size = dsk.meta_len;
}
catch (std::exception & e)
{
fprintf(stderr, "%s\n", e.what());
return 1;
}
// Leave some extra space for future metadata formats and round metadata area size to multiples of 1 MB
uint64_t meta_reserve_multiple = 2, min_meta_size = (uint64_t)1024*1024*1024;
if (options.find("meta_reserve") != options.end())
{
int p1 = options["meta_reserve"].find("x"), p2 = options["meta_reserve"].find(",");
if (p1 >= 0 && p2 >= 0)
{
meta_reserve_multiple = stoull_full(options["meta_reserve"].substr(p1 < p2 ? 0 : p2, p1 - (p1 < p2 ? 0 : p2)));
min_meta_size = parse_size(options["meta_reserve"].substr(p1 < p2 ? p2 : 0, p1 < p2 ? options["meta_reserve"].size()-p2 : p2));
}
else if (p1 >= 0)
meta_reserve_multiple = stoull_full(options["meta_reserve"].substr(0, p1));
else
min_meta_size = parse_size(options["meta_reserve"]);
}
meta_size = ((meta_size+1024*1024-1)/1024/1024)*1024*1024;
meta_size *= meta_reserve_multiple;
if (meta_size < min_meta_size)
meta_size = min_meta_size;
// Pick an SSD for journal&meta, balancing the number of serviced OSDs across SSDs
int sel = -1;
for (int i = 0; i < ssds.size(); i++)
if (ssds[i].free >= (meta_size+journal_size+4096*2) && (sel == -1 || ssds[sel].osd_part_count > ssds[i].osd_part_count))
sel = i;
if (sel < 0)
{
fprintf(
stderr, "Could not find free space for new SSD journal and metadata (need %lu + %lu MiB)\n",
meta_size/1024/1024, journal_size/1024/1024
);
return 1;
}
// Create partitions
auto new_parts = add_partitions(ssds[sel], {
std::to_string(journal_size/1024/1024)+"MiB",
std::to_string(meta_size/1024/1024)+"MiB"
});
if (new_parts.is_null())
{
return 1;
}
ssds[sel].osd_part_count += 2;
options["journal_device"] = "/dev/disk/by-partuuid/"+strtolower(new_parts[0]["uuid"].string_value());
options["meta_device"] = "/dev/disk/by-partuuid/"+strtolower(new_parts[1]["uuid"].string_value());
return 0;
}
// "prepare" command: create OSD(s).
// With options["data_device"] set, a single OSD is created from the explicit options
// (device list, --hybrid and --osd_per_disk are rejected in this mode).
// Otherwise `devices` lists whole disks: each of them gets up to osd_per_disk (default 1)
// OSD partitions. With "hybrid", OSDs are created on HDDs only and the journal and
// metadata of each one are placed on new partitions of one of the listed SSDs.
// Returns 0 when everything succeeded, 1 otherwise. A failure to create one OSD does not
// stop the processing of the remaining partitions and disks.
int disk_tool_t::prepare(std::vector<std::string> devices)
{
    if (options.find("data_device") != options.end() && options["data_device"] != "")
    {
        if (options.find("hybrid") != options.end() || options.find("osd_per_disk") != options.end() || devices.size())
        {
            fprintf(stderr, "Device list (positional arguments) and --hybrid are incompatible with --data_device\n");
            return 1;
        }
        return prepare_one(options);
    }
    if (!devices.size())
    {
        fprintf(stderr, "Device list missing\n");
        return 1;
    }
    options.erase("data_device");
    options.erase("meta_device");
    options.erase("journal_device");
    auto devinfo = collect_devices(devices);
    if (!devinfo.size())
    {
        return 1;
    }
    bool hybrid = options.find("hybrid") != options.end();
    uint64_t osd_per_disk = stoull_full(options["osd_per_disk"]);
    if (!osd_per_disk)
        osd_per_disk = 1;
    uint64_t max_other_percent = stoull_full(trim(options["max_other"], " \n\r\t%"));
    if (max_other_percent > 100)
        max_other_percent = 100;
    std::vector<vitastor_dev_info_t> ssds;
    if (hybrid)
    {
        for (auto & dev: devinfo)
            if (!dev.is_hdd)
                ssds.push_back(dev);
        if (!ssds.size())
        {
            fprintf(stderr, "No SSDs found\n");
            return 1;
        }
        if (options["journal_size"] == "")
            options["journal_size"] = DEFAULT_HYBRID_JOURNAL;
    }
    int result = 0;
    for (auto & dev: devinfo)
    {
        if (!hybrid || dev.is_hdd)
        {
            // Select new partitions and create an OSD on each of them
            for (const auto & uuid: get_new_data_parts(dev, osd_per_disk, max_other_percent))
            {
                // Only the presence of the key matters to prepare_one(); store a readable
                // value (assigning `true` to a std::string stores the character '\x01')
                options["force"] = "1";
                // These are GPT partition UUIDs, so the symlink is in by-partuuid/,
                // the same directory add_partitions() waits on
                options["data_device"] = "/dev/disk/by-partuuid/"+strtolower(uuid);
                if (hybrid)
                {
                    // Select/create journal and metadata partitions
                    int r = get_meta_partition(ssds, options);
                    if (r != 0)
                    {
                        return 1;
                    }
                }
                // Keep going with the other partitions, but report the failure in the end
                if (prepare_one(options, dev.is_hdd ? 1 : 0) != 0)
                    result = 1;
            }
        }
    }
    return result;
}

495
src/disk_tool_resize.cpp Normal file
View File

@ -0,0 +1,495 @@
// Copyright (c) Vitaliy Filippov, 2019+
// License: VNPL-1.1 (see README.md for details)
#include "disk_tool.h"
#include "rw_blocking.h"
// States of one in-flight block move slot (resizer_data_moving_t::state)
// Slot is free and may take the next block from the remap list
#define DM_ST_EMPTY 0
// Block assigned, waiting for a submission queue entry to issue the read
#define DM_ST_TO_READ 1
// Read submitted, waiting for completion
#define DM_ST_READING 2
// Read completed, waiting for a submission queue entry to issue the write
#define DM_ST_TO_WRITE 3
// Write submitted, waiting for completion
#define DM_ST_WRITING 4

// One slot of the data mover used by disk_tool_t::resize_copy_data()
struct resizer_data_moving_t
{
    // One of DM_ST_*
    int state = DM_ST_EMPTY;
    // Data buffer of one block (a slice of a shared allocation, not owned by the slot)
    void *buf = NULL;
    // Source and destination block numbers; zero-initialized so that
    // a freshly created slot never holds indeterminate values
    uint64_t old_loc = 0, new_loc = 0;
};
int disk_tool_t::resize_data()
{
int r;
// Parse parameters
r = resize_parse_params();
if (r != 0)
return r;
// Check parameters and fill allocator
fprintf(stderr, "Reading metadata\n");
data_alloc = new allocator((new_data_len < dsk.data_len ? dsk.data_len : new_data_len) / dsk.data_block_size);
r = process_meta(
[this](blockstore_meta_header_v1_t *hdr)
{
resize_init(hdr);
},
[this](uint64_t block_num, clean_disk_entry *entry, uint8_t *bitmap)
{
data_alloc->set(block_num, true);
}
);
if (r != 0)
return r;
fprintf(stderr, "Reading journal\n");
r = process_journal([this](void *buf)
{
return process_journal_block(buf, [this](int num, journal_entry *je)
{
if (je->type == JE_BIG_WRITE || je->type == JE_BIG_WRITE_INSTANT)
{
data_alloc->set(je->big_write.location / dsk.data_block_size, true);
}
});
});
if (r != 0)
return r;
// Remap blocks
r = resize_remap_blocks();
if (r != 0)
return r;
// Copy data blocks into new places
fprintf(stderr, "Moving data blocks\n");
r = resize_copy_data();
if (r != 0)
return r;
// Rewrite journal
fprintf(stderr, "Rebuilding journal\n");
r = resize_rewrite_journal();
if (r != 0)
return r;
// Rewrite metadata
fprintf(stderr, "Rebuilding metadata\n");
r = resize_rewrite_meta();
if (r != 0)
return r;
// Write new journal
fprintf(stderr, "Writing new journal\n");
r = resize_write_new_journal();
if (r != 0)
return r;
// Write new metadata
fprintf(stderr, "Writing new metadata\n");
r = resize_write_new_meta();
if (r != 0)
return r;
fprintf(stderr, "Done\n");
return 0;
}
int disk_tool_t::resize_parse_params()
{
try
{
dsk.parse_config(options);
dsk.open_data();
dsk.open_meta();
dsk.open_journal();
dsk.calc_lengths();
dsk.close_all();
}
catch (std::exception & e)
{
dsk.close_all();
fprintf(stderr, "Error: %s\n", e.what());
return 1;
}
iodepth = strtoull(options["iodepth"].c_str(), NULL, 10);
if (!iodepth)
iodepth = 32;
new_meta_device = options.find("new_meta_device") != options.end()
? options["new_meta_device"] : dsk.meta_device;
new_journal_device = options.find("new_journal_device") != options.end()
? options["new_journal_device"] : dsk.journal_device;
new_data_offset = options.find("new_data_offset") != options.end()
? strtoull(options["new_data_offset"].c_str(), NULL, 10) : dsk.data_offset;
new_data_len = options.find("new_data_len") != options.end()
? strtoull(options["new_data_len"].c_str(), NULL, 10) : dsk.data_len;
new_meta_offset = options.find("new_meta_offset") != options.end()
? strtoull(options["new_meta_offset"].c_str(), NULL, 10) : dsk.meta_offset;
new_meta_len = options.find("new_meta_len") != options.end()
? strtoull(options["new_meta_len"].c_str(), NULL, 10) : 0; // will be calculated in resize_init()
new_journal_offset = options.find("new_journal_offset") != options.end()
? strtoull(options["new_journal_offset"].c_str(), NULL, 10) : dsk.journal_offset;
new_journal_len = options.find("new_journal_len") != options.end()
? strtoull(options["new_journal_len"].c_str(), NULL, 10) : dsk.journal_len;
if (new_meta_device == dsk.meta_device &&
new_journal_device == dsk.journal_device &&
new_data_offset == dsk.data_offset &&
new_data_len == dsk.data_len &&
new_meta_offset == dsk.meta_offset &&
(new_meta_len == dsk.meta_len || new_meta_len == 0) &&
new_journal_offset == dsk.journal_offset &&
new_journal_len == dsk.journal_len &&
options.find("force") == options.end())
{
// No difference
fprintf(stderr, "No difference, specify --force to rewrite journal and meta anyway\n");
return 1;
}
return 0;
}
// Validates the requested layout against the current one and derives the values
// used by later phases: block index shift (data_idx_diff), number of blocks to free
// at the beginning/end (free_first/free_last) and the new metadata entry geometry.
// <hdr> is the metadata header, NULL when there is none.
// Terminates the process with exit(1) on an invalid layout.
void disk_tool_t::resize_init(blockstore_meta_header_v1_t *hdr)
{
    if (hdr && dsk.data_block_size != hdr->data_block_size)
    {
        if (dsk.data_block_size)
            fprintf(stderr, "Using data block size of %u bytes from metadata superblock\n", hdr->data_block_size);
        dsk.data_block_size = hdr->data_block_size;
    }
    // Both the size change and the offset change must be whole blocks
    if (((new_data_len-dsk.data_len) % dsk.data_block_size) ||
        ((new_data_offset-dsk.data_offset) % dsk.data_block_size))
    {
        fprintf(stderr, "Data alignment mismatch\n");
        exit(1);
    }
    data_idx_diff = ((int64_t)(dsk.data_offset-new_data_offset)) / dsk.data_block_size;
    // Blocks cut off at the beginning of the old data area
    if (new_data_offset > dsk.data_offset)
        free_first = (new_data_offset-dsk.data_offset) / dsk.data_block_size;
    else
        free_first = 0;
    // Blocks cut off at the end of the old data area
    if (new_data_offset+new_data_len < dsk.data_offset+dsk.data_len)
        free_last = (dsk.data_offset+dsk.data_len-new_data_offset-new_data_len) / dsk.data_block_size;
    else
        free_last = 0;
    new_clean_entry_bitmap_size = dsk.data_block_size / (hdr ? hdr->bitmap_granularity : 4096) / 8;
    new_clean_entry_size = sizeof(clean_disk_entry) + 2 * new_clean_entry_bitmap_size;
    new_entries_per_block = dsk.meta_block_size/new_clean_entry_size;
    // One header block plus enough blocks for all entries
    uint64_t new_meta_blocks = 1 + (new_data_len/dsk.data_block_size + new_entries_per_block-1) / new_entries_per_block;
    if (!new_meta_len)
        new_meta_len = dsk.meta_block_size*new_meta_blocks;
    if (new_meta_len < dsk.meta_block_size*new_meta_blocks)
    {
        fprintf(stderr, "New metadata area size is too small, should be at least %lu bytes\n", dsk.meta_block_size*new_meta_blocks);
        exit(1);
    }
    // Check that new metadata, journal and data areas don't overlap
    auto overlaps = [](uint64_t off1, uint64_t len1, uint64_t off2, uint64_t len2)
    {
        return off1 < off2+len2 && off1+len1 > off2;
    };
    if (new_meta_device == dsk.data_device &&
        overlaps(new_meta_offset, new_meta_len, new_data_offset, new_data_len))
    {
        fprintf(stderr, "New metadata area overlaps with data\n");
        exit(1);
    }
    if (new_journal_device == dsk.data_device &&
        overlaps(new_journal_offset, new_journal_len, new_data_offset, new_data_len))
    {
        fprintf(stderr, "New journal area overlaps with data\n");
        exit(1);
    }
    if (new_journal_device == new_meta_device &&
        overlaps(new_journal_offset, new_journal_len, new_meta_offset, new_meta_len))
    {
        fprintf(stderr, "New journal area overlaps with metadata\n");
        exit(1);
    }
}
// Builds data_remap: for every used block that falls into the area being cut off
// (the first free_first and the last free_last blocks of the old data area)
// selects a free block to move it to. Unused blocks of these areas are marked as
// occupied in the allocator so nothing is moved into them.
// Returns 1 if there is not enough free space, 0 otherwise.
int disk_tool_t::resize_remap_blocks()
{
    total_blocks = dsk.data_len / dsk.data_block_size;
    for (uint64_t i = 0; i < free_first; i++)
    {
        if (data_alloc->get(i))
            data_remap[i] = 0;
        else
            data_alloc->set(i, true);
    }
    // The tail consists of blocks [total_blocks-free_last, total_blocks-1].
    // The last valid block index is total_blocks-1, not total_blocks.
    for (uint64_t i = 0; i < free_last && i < total_blocks; i++)
    {
        const uint64_t blk = total_blocks-1-i;
        if (data_alloc->get(blk))
            data_remap[blk] = 0;
        else
            data_alloc->set(blk, true);
    }
    // Now all blocks of the removed areas are marked as used, so find_free()
    // only returns blocks which remain in place
    for (auto & p: data_remap)
    {
        uint64_t new_loc = data_alloc->find_free();
        if (new_loc == UINT64_MAX)
        {
            fprintf(stderr, "Not enough space to move data\n");
            return 1;
        }
        data_alloc->set(new_loc, true);
        p.second = new_loc;
    }
    return 0;
}
// Copies every block listed in data_remap from its old location to its new location
// inside the data device, keeping up to <iodepth> blocks in flight through ringloop.
// Returns 1 if the data device can't be opened, 0 after all blocks are copied.
// Any read or write error terminates the process with exit(1).
int disk_tool_t::resize_copy_data()
{
// Fall back to the default queue depth for out-of-range values
if (iodepth <= 0 || iodepth > 4096)
{
iodepth = 32;
}
ringloop = new ring_loop_t(iodepth < 512 ? 512 : iodepth);
dsk.data_fd = open(dsk.data_device.c_str(), O_DIRECT|O_RDWR);
if (dsk.data_fd < 0)
{
fprintf(stderr, "Failed to open data device %s: %s\n", dsk.data_device.c_str(), strerror(errno));
delete ringloop;
ringloop = NULL;
return 1;
}
// One slot per in-flight block. All slots share a single aligned allocation:
// slot 0 owns it, other slots point inside it
moving_blocks = new resizer_data_moving_t[iodepth];
moving_blocks[0].buf = memalign_or_die(MEM_ALIGNMENT, iodepth*dsk.data_block_size);
for (int i = 1; i < iodepth; i++)
{
moving_blocks[i].buf = (uint8_t*)moving_blocks[0].buf + i*dsk.data_block_size;
}
remap_active = 1;
remap_it = data_remap.begin();
// Consumer callback: advances each slot through
// EMPTY -> TO_READ -> READING -> TO_WRITE -> WRITING -> EMPTY
ring_consumer.loop = [this]()
{
// Recounted on every pass: number of slots which still have work
remap_active = 0;
for (int i = 0; i < iodepth; i++)
{
// Take the next block to move into a free slot
if (moving_blocks[i].state == DM_ST_EMPTY && remap_it != data_remap.end())
{
uint64_t old_loc = remap_it->first, new_loc = remap_it->second;
moving_blocks[i].state = DM_ST_TO_READ;
moving_blocks[i].old_loc = old_loc;
moving_blocks[i].new_loc = new_loc;
remap_it++;
}
if (moving_blocks[i].state == DM_ST_TO_READ)
{
// If no submission queue entry is available the slot stays in TO_READ
// and is retried on the next pass
struct io_uring_sqe *sqe = ringloop->get_sqe();
if (sqe)
{
moving_blocks[i].state = DM_ST_READING;
struct ring_data_t *data = ((ring_data_t*)sqe->user_data);
data->iov = (struct iovec){ moving_blocks[i].buf, dsk.data_block_size };
my_uring_prep_readv(sqe, dsk.data_fd, &data->iov, 1, dsk.data_offset + moving_blocks[i].old_loc*dsk.data_block_size);
data->callback = [this, i](ring_data_t *data)
{
// Anything except a full block read is fatal
if (data->res != dsk.data_block_size)
{
fprintf(
stderr, "Failed to read %u bytes at %lu from %s: %s\n", dsk.data_block_size,
dsk.data_offset + moving_blocks[i].old_loc*dsk.data_block_size, dsk.data_device.c_str(),
data->res < 0 ? strerror(-data->res) : "short read"
);
exit(1);
}
moving_blocks[i].state = DM_ST_TO_WRITE;
ringloop->wakeup();
};
}
}
if (moving_blocks[i].state == DM_ST_TO_WRITE)
{
struct io_uring_sqe *sqe = ringloop->get_sqe();
if (sqe)
{
moving_blocks[i].state = DM_ST_WRITING;
struct ring_data_t *data = ((ring_data_t*)sqe->user_data);
data->iov = (struct iovec){ moving_blocks[i].buf, dsk.data_block_size };
my_uring_prep_writev(sqe, dsk.data_fd, &data->iov, 1, dsk.data_offset + moving_blocks[i].new_loc*dsk.data_block_size);
data->callback = [this, i](ring_data_t *data)
{
// Anything except a full block write is fatal
if (data->res != dsk.data_block_size)
{
fprintf(
stderr, "Failed to write %u bytes at %lu to %s: %s\n", dsk.data_block_size,
dsk.data_offset + moving_blocks[i].new_loc*dsk.data_block_size, dsk.data_device.c_str(),
data->res < 0 ? strerror(-data->res) : "short write"
);
exit(1);
}
// The slot becomes free for the next block
moving_blocks[i].state = DM_ST_EMPTY;
ringloop->wakeup();
};
}
}
remap_active += moving_blocks[i].state != DM_ST_EMPTY ? 1 : 0;
}
ringloop->submit();
};
ringloop->register_consumer(&ring_consumer);
// Run until a pass of the consumer leaves no slot with pending work
while (1)
{
ringloop->loop();
if (!remap_active)
break;
ringloop->wait();
}
ringloop->unregister_consumer(&ring_consumer);
// Slot 0 holds the pointer to the shared buffer
free(moving_blocks[0].buf);
delete[] moving_blocks;
moving_blocks = NULL;
close(dsk.data_fd);
dsk.data_fd = -1;
delete ringloop;
ringloop = NULL;
return 0;
}
// Builds the new journal in memory (new_journal_buf, new_journal_len bytes):
// re-reads the current journal and copies every entry into the new buffer,
// rewriting block references of big writes according to data_remap/data_idx_diff
// and re-packing small write data right after the entry blocks.
// Returns 0 on success or the non-zero result of process_journal() if the current
// journal can't be read; the buffer is released in that case.
// Terminates the process with exit(1) if the entries don't fit into the new journal.
int disk_tool_t::resize_rewrite_journal()
{
    // Simply overwriting on the fly may be impossible because old and new areas may overlap
    // For now, just build new journal data in memory
    new_journal_buf = (uint8_t*)memalign_or_die(MEM_ALIGNMENT, new_journal_len);
    new_journal_ptr = new_journal_buf;
    new_journal_data = new_journal_ptr + dsk.journal_block_size;
    new_journal_in_pos = 0;
    memset(new_journal_buf, 0, new_journal_len);
    int r = process_journal([this](void *buf)
    {
        return process_journal_block(buf, [this](int num, journal_entry *je)
        {
            if (je->type == JE_START)
            {
                // The start entry is regenerated and occupies a block of its own
                journal_entry *ne = (journal_entry*)(new_journal_ptr + new_journal_in_pos);
                *((journal_entry_start*)ne) = (journal_entry_start){
                    .magic = JOURNAL_MAGIC,
                    .type = JE_START,
                    .size = sizeof(journal_entry_start),
                    .journal_start = dsk.journal_block_size,
                    .version = JOURNAL_VERSION,
                };
                ne->crc32 = je_crc32(ne);
                new_journal_ptr += dsk.journal_block_size;
                new_journal_data = new_journal_ptr+dsk.journal_block_size;
                new_journal_in_pos = 0;
            }
            else
            {
                if (dsk.journal_block_size < new_journal_in_pos+je->size)
                {
                    // Entry doesn't fit into the current block: start a new entry block
                    // right after the small write data written so far
                    new_journal_ptr = new_journal_data;
                    if (new_journal_ptr-new_journal_buf >= new_journal_len)
                    {
                        fprintf(stderr, "Error: live entries don't fit to the new journal\n");
                        exit(1);
                    }
                    new_journal_data = new_journal_ptr+dsk.journal_block_size;
                    new_journal_in_pos = 0;
                    if (dsk.journal_block_size < je->size)
                    {
                        fprintf(stderr, "Error: journal entry too large (%u bytes)\n", je->size);
                        exit(1);
                    }
                }
                journal_entry *ne = (journal_entry*)(new_journal_ptr + new_journal_in_pos);
                memcpy(ne, je, je->size);
                // Entries are chained by the checksum of the previous entry
                ne->crc32_prev = new_crc32_prev;
                if (je->type == JE_BIG_WRITE || je->type == JE_BIG_WRITE_INSTANT)
                {
                    // Change the block reference
                    auto remap_it = data_remap.find(ne->big_write.location / dsk.data_block_size);
                    if (remap_it != data_remap.end())
                    {
                        ne->big_write.location = remap_it->second * dsk.data_block_size;
                    }
                    ne->big_write.location += data_idx_diff * dsk.data_block_size;
                }
                else if (je->type == JE_SMALL_WRITE || je->type == JE_SMALL_WRITE_INSTANT)
                {
                    // Small write data is stored inside the journal and gets a new offset
                    ne->small_write.data_offset = new_journal_data-new_journal_buf;
                    if (ne->small_write.data_offset + ne->small_write.len > new_journal_len)
                    {
                        fprintf(stderr, "Error: live entries don't fit to the new journal\n");
                        exit(1);
                    }
                    memcpy(new_journal_data, small_write_data, ne->small_write.len);
                    new_journal_data += ne->small_write.len;
                }
                ne->crc32 = je_crc32(ne);
                new_journal_in_pos += ne->size;
                new_crc32_prev = ne->crc32;
            }
        });
    });
    if (r != 0)
    {
        // Don't let the caller continue and write a partially built journal
        free(new_journal_buf);
        new_journal_buf = NULL;
        return r;
    }
    return 0;
}
// Writes the journal built by resize_rewrite_journal() (new_journal_buf) to
// new_journal_device at new_journal_offset and releases the buffer.
// Returns 0 on success and 1 if the device can't be opened, positioned or synced.
int disk_tool_t::resize_write_new_journal()
{
    new_journal_fd = open(new_journal_device.c_str(), O_DIRECT|O_RDWR);
    if (new_journal_fd < 0)
    {
        fprintf(stderr, "Failed to open new journal device %s: %s\n", new_journal_device.c_str(), strerror(errno));
        return 1;
    }
    int res = 0;
    if (lseek64(new_journal_fd, new_journal_offset, SEEK_SET) < 0)
    {
        // Writing at a wrong position would destroy unrelated data
        fprintf(stderr, "Failed to seek to %lu in %s: %s\n", new_journal_offset, new_journal_device.c_str(), strerror(errno));
        res = 1;
    }
    else
    {
        // NOTE(review): the result of write_blocking() is not checked because its
        // error contract is not visible here - confirm that it reports/aborts on failure
        write_blocking(new_journal_fd, new_journal_buf, new_journal_len);
        if (fsync(new_journal_fd) != 0)
        {
            fprintf(stderr, "Failed to fsync %s: %s\n", new_journal_device.c_str(), strerror(errno));
            res = 1;
        }
    }
    close(new_journal_fd);
    new_journal_fd = -1;
    free(new_journal_buf);
    new_journal_buf = NULL;
    return res;
}
int disk_tool_t::resize_rewrite_meta()
{
new_meta_buf = (uint8_t*)memalign_or_die(MEM_ALIGNMENT, new_meta_len);
memset(new_meta_buf, 0, new_meta_len);
int r = process_meta(
[this](blockstore_meta_header_v1_t *hdr)
{
blockstore_meta_header_v1_t *new_hdr = (blockstore_meta_header_v1_t *)new_meta_buf;
new_hdr->zero = 0;
new_hdr->magic = BLOCKSTORE_META_MAGIC_V1;
new_hdr->version = BLOCKSTORE_META_VERSION_V1;
new_hdr->meta_block_size = dsk.meta_block_size;
new_hdr->data_block_size = dsk.data_block_size;
new_hdr->bitmap_granularity = dsk.bitmap_granularity ? dsk.bitmap_granularity : 4096;
},
[this](uint64_t block_num, clean_disk_entry *entry, uint8_t *bitmap)
{
auto remap_it = data_remap.find(block_num);
if (remap_it != data_remap.end())
block_num = remap_it->second;
if (block_num < free_first || block_num >= total_blocks-free_last)
{
fprintf(stderr, "BUG: remapped block not in range\n");
exit(1);
}
block_num += data_idx_diff;
clean_disk_entry *new_entry = (clean_disk_entry*)(new_meta_buf + dsk.meta_block_size +
dsk.meta_block_size*(block_num / new_entries_per_block) +
new_clean_entry_size*(block_num % new_entries_per_block));
new_entry->oid = entry->oid;
new_entry->version = entry->version;
if (bitmap)
memcpy(new_entry->bitmap, bitmap, 2*new_clean_entry_bitmap_size);
else
memset(new_entry->bitmap, 0xff, 2*new_clean_entry_bitmap_size);
}
);
if (r != 0)
{
free(new_meta_buf);
new_meta_buf = NULL;
return r;
}
return 0;
}
// Writes the metadata built by resize_rewrite_meta() (new_meta_buf) to
// new_meta_device at new_meta_offset and releases the buffer.
// Returns 0 on success and 1 if the device can't be opened, positioned or synced.
int disk_tool_t::resize_write_new_meta()
{
    new_meta_fd = open(new_meta_device.c_str(), O_DIRECT|O_RDWR);
    if (new_meta_fd < 0)
    {
        fprintf(stderr, "Failed to open new metadata device %s: %s\n", new_meta_device.c_str(), strerror(errno));
        return 1;
    }
    int res = 0;
    if (lseek64(new_meta_fd, new_meta_offset, SEEK_SET) < 0)
    {
        // Writing at a wrong position would destroy unrelated data
        fprintf(stderr, "Failed to seek to %lu in %s: %s\n", new_meta_offset, new_meta_device.c_str(), strerror(errno));
        res = 1;
    }
    else
    {
        // NOTE(review): the result of write_blocking() is not checked because its
        // error contract is not visible here - confirm that it reports/aborts on failure
        write_blocking(new_meta_fd, new_meta_buf, new_meta_len);
        if (fsync(new_meta_fd) != 0)
        {
            fprintf(stderr, "Failed to fsync %s: %s\n", new_meta_device.c_str(), strerror(errno));
            res = 1;
        }
    }
    close(new_meta_fd);
    new_meta_fd = -1;
    free(new_meta_buf);
    new_meta_buf = NULL;
    return res;
}

426
src/disk_tool_udev.cpp Normal file
View File

@ -0,0 +1,426 @@
// Copyright (c) Vitaliy Filippov, 2019+
// License: VNPL-1.1 (see README.md for details)
#include <dirent.h>
#include "disk_tool.h"
#include "rw_blocking.h"
// OSD superblock as stored at the beginning of a device (packed, no padding).
// Written by write_osd_superblock() and parsed by read_osd_superblock().
struct __attribute__((__packed__)) vitastor_disk_superblock_t
{
// Must be equal to VITASTOR_DISK_MAGIC
uint64_t magic;
// crc32c of the superblock starting from the <size> field up to <size> bytes from the start
uint32_t crc32c;
// Total superblock size in bytes: this header plus JSON data
uint32_t size;
// JSON-encoded OSD parameters, (size - sizeof(header)) bytes, not null-terminated
uint8_t json_data[];
};
// Escapes a value for printing as a udev property: puts a backslash before
// every quote (" and ') and whitespace character (space, \t, \r, \n).
// Returns the string unchanged when it has no such characters.
static std::string udev_escape(std::string str)
{
    static const char special[] = "\"\' \t\r\n";
    // Positions are kept as size_t: storing them in int truncates npos
    size_t pos = str.find_first_of(special);
    if (pos == std::string::npos)
    {
        return str;
    }
    std::string r;
    r.reserve(str.size()+8);
    size_t prev = 0;
    while (pos != std::string::npos)
    {
        // Copy everything up to the special character, then the backslash;
        // the special character itself is copied with the next chunk
        r.append(str, prev, pos-prev);
        r += '\\';
        prev = pos;
        pos = str.find_first_of(special, pos+1);
    }
    r.append(str, prev, std::string::npos);
    return r;
}
int disk_tool_t::udev_import(std::string device)
{
json11::Json sb = read_osd_superblock(device);
if (sb.is_null())
{
return 1;
}
uint64_t osd_num = sb["params"]["osd_num"].uint64_value();
// Print variables for udev
printf("VITASTOR_OSD_NUM=%lu\n", osd_num);
printf("VITASTOR_ALIAS=osd%lu%s\n", osd_num, sb["device_type"].string_value().c_str());
printf("VITASTOR_DATA_DEVICE=%s\n", udev_escape(sb["params"]["data_device"].string_value()).c_str());
if (sb["real_meta_device"].string_value() != "" && sb["real_meta_device"] != sb["real_data_device"])
printf("VITASTOR_META_DEVICE=%s\n", udev_escape(sb["params"]["meta_device"].string_value()).c_str());
if (sb["real_journal_device"].string_value() != "" && sb["real_journal_device"] != sb["real_meta_device"])
printf("VITASTOR_JOURNAL_DEVICE=%s\n", udev_escape(sb["params"]["journal_device"].string_value()).c_str());
return 0;
}
int disk_tool_t::read_sb(std::string device)
{
json11::Json sb = read_osd_superblock(device);
if (sb.is_null())
{
return 1;
}
printf("%s\n", sb["params"].dump().c_str());
return 0;
}
// Reads OSD parameters as JSON from stdin and writes them into the superblock of <device>.
// The JSON must contain a non-zero "osd_num" and a non-empty "data_device".
// Returns 0 on success, 1 on invalid input or write failure.
int disk_tool_t::write_sb(std::string device)
{
    std::string input;
    char buf[4096];
    while (1)
    {
        ssize_t r = read(0, buf, sizeof(buf));
        if (r > 0)
        {
            input.append(buf, (size_t)r);
        }
        // Stop at EOF or on a real error. errno is only meaningful when read() fails,
        // and a failed read must never be appended (its length would be -1)
        else if (r == 0 || (errno != EAGAIN && errno != EINTR))
        {
            break;
        }
    }
    std::string json_err;
    json11::Json params = json11::Json::parse(input, json_err);
    if (json_err != "" || !params["osd_num"].uint64_value() || params["data_device"].string_value() == "")
    {
        fprintf(stderr, "Invalid JSON input\n");
        return 1;
    }
    // write_osd_superblock() returns the superblock size, 0 means failure
    return !write_osd_superblock(device, params);
}
uint32_t disk_tool_t::write_osd_superblock(std::string device, json11::Json params)
{
std::string json_data = params.dump();
uint32_t sb_size = sizeof(vitastor_disk_superblock_t)+json_data.size();
if (sb_size > VITASTOR_DISK_MAX_SB_SIZE)
{
fprintf(stderr, "JSON data for superblock is too large\n");
return 0;
}
uint64_t buf_len = ((sb_size+4095)/4096) * 4096;
uint8_t *buf = (uint8_t*)memalign_or_die(MEM_ALIGNMENT, buf_len);
memset(buf, 0, buf_len);
vitastor_disk_superblock_t *sb = (vitastor_disk_superblock_t*)buf;
sb->magic = VITASTOR_DISK_MAGIC;
sb->size = sb_size;
memcpy(sb->json_data, json_data.c_str(), json_data.size());
sb->crc32c = crc32c(0, &sb->size, sb->size - ((uint8_t*)&sb->size - buf));
int fd = open(device.c_str(), O_DIRECT|O_RDWR);
if (fd < 0)
{
fprintf(stderr, "Failed to open device %s: %s\n", device.c_str(), strerror(errno));
free(buf);
return 0;
}
int r = write_blocking(fd, buf, buf_len);
if (r < 0)
{
fprintf(stderr, "Failed to write to %s: %s\n", device.c_str(), strerror(errno));
close(fd);
free(buf);
return 0;
}
close(fd);
free(buf);
return sb_size;
}
// Reads and validates the OSD superblock from the beginning of <device>.
// On success returns an object with:
//   "params" - OSD parameters stored in the superblock,
//   "device_type" - "data", "meta" or "journal": which role <device> has in this OSD,
//   "real_data_device", "real_meta_device", "real_journal_device" - resolved device paths,
//     meta/journal are empty when they are not separate devices.
// Returns a null Json on any failure. Validation error messages are only printed
// when <expect_exist> is true; open/read errors are always printed.
json11::Json disk_tool_t::read_osd_superblock(std::string device, bool expect_exist)
{
// All locals are declared up front because of the `goto ex` cleanup path
vitastor_disk_superblock_t *sb = NULL;
uint8_t *buf = NULL;
json11::Json osd_params;
std::string json_err;
std::string real_device, device_type, real_data, real_meta, real_journal;
int r, fd = open(device.c_str(), O_DIRECT|O_RDWR);
if (fd < 0)
{
fprintf(stderr, "Failed to open device %s: %s\n", device.c_str(), strerror(errno));
return osd_params;
}
// Read the first 4096 bytes, it's enough to check the header
buf = (uint8_t*)memalign_or_die(MEM_ALIGNMENT, 4096);
r = read_blocking(fd, buf, 4096);
if (r != 4096)
{
fprintf(stderr, "Failed to read OSD superblock from %s: %s\n", device.c_str(), strerror(errno));
goto ex;
}
sb = (vitastor_disk_superblock_t*)buf;
if (sb->magic != VITASTOR_DISK_MAGIC)
{
if (expect_exist)
fprintf(stderr, "Invalid OSD superblock on %s: magic number mismatch\n", device.c_str());
goto ex;
}
if (sb->size > VITASTOR_DISK_MAX_SB_SIZE ||
// +2 is minimal json: {}
sb->size < sizeof(vitastor_disk_superblock_t)+2)
{
if (expect_exist)
fprintf(stderr, "Invalid OSD superblock on %s: invalid size\n", device.c_str());
goto ex;
}
// Superblock is larger than the first read: re-read it as a whole, rounded up to 4096 bytes
if (sb->size > 4096)
{
uint64_t sb_size = ((sb->size+4095)/4096)*4096;
free(buf);
buf = (uint8_t*)memalign_or_die(MEM_ALIGNMENT, sb_size);
lseek64(fd, 0, 0);
r = read_blocking(fd, buf, sb_size);
if (r != sb_size)
{
fprintf(stderr, "Failed to read OSD superblock from %s: %s\n", device.c_str(), strerror(errno));
goto ex;
}
sb = (vitastor_disk_superblock_t*)buf;
}
// Checksum covers everything from the <size> field to the end of JSON data
if (sb->crc32c != crc32c(0, &sb->size, sb->size - ((uint8_t*)&sb->size - buf)))
{
if (expect_exist)
fprintf(stderr, "Invalid OSD superblock on %s: crc32 mismatch\n", device.c_str());
goto ex;
}
osd_params = json11::Json::parse(std::string((char*)sb->json_data, sb->size - sizeof(vitastor_disk_superblock_t)), json_err);
if (json_err != "")
{
if (expect_exist)
fprintf(stderr, "Invalid OSD superblock on %s: invalid JSON\n", device.c_str());
goto ex;
}
// Validate superblock
if (!osd_params["osd_num"].uint64_value())
{
if (expect_exist)
fprintf(stderr, "OSD superblock on %s lacks osd_num\n", device.c_str());
osd_params = json11::Json();
goto ex;
}
if (osd_params["data_device"].string_value() == "")
{
if (expect_exist)
fprintf(stderr, "OSD superblock on %s lacks data_device\n", device.c_str());
osd_params = json11::Json();
goto ex;
}
// Resolve device paths. Metadata defaults to the data device and journal defaults
// to the metadata device, such devices are reported as empty strings
real_device = realpath_str(device);
real_data = realpath_str(osd_params["data_device"].string_value());
real_meta = osd_params["meta_device"] != "" && osd_params["meta_device"] != osd_params["data_device"]
? realpath_str(osd_params["meta_device"].string_value()) : "";
real_journal = osd_params["journal_device"] != "" && osd_params["journal_device"] != osd_params["meta_device"]
? realpath_str(osd_params["journal_device"].string_value()) : "";
// Different paths may still resolve to the same device
if (real_journal == real_meta)
{
real_journal = "";
}
if (real_meta == real_data)
{
real_meta = "";
}
// The superblock must refer to the device it is read from
if (real_device == real_data)
{
device_type = "data";
}
else if (real_device == real_meta)
{
device_type = "meta";
}
else if (real_device == real_journal)
{
device_type = "journal";
}
else
{
if (expect_exist)
fprintf(stderr, "Invalid OSD superblock on %s: does not refer to the device itself\n", device.c_str());
osd_params = json11::Json();
goto ex;
}
osd_params = json11::Json::object{
{ "params", osd_params },
{ "device_type", device_type },
{ "real_data_device", real_data },
{ "real_meta_device", real_meta },
{ "real_journal_device", real_journal },
};
// Common cleanup for success and failure paths
ex:
free(buf);
close(fd);
return osd_params;
}
// Runs `systemctl <cmd...> vitastor-osd@<N>...` for OSDs found on <devices>.
// OSD numbers are taken from superblocks; devices without a valid superblock are skipped.
// Replaces the current process with systemctl, so it only returns on failure:
// 1 if no devices are given, no OSDs are found or systemctl can't be executed.
int disk_tool_t::systemd_start_stop_osds(std::vector<std::string> cmd, std::vector<std::string> devices)
{
    if (!devices.size())
    {
        fprintf(stderr, "Device path is missing\n");
        return 1;
    }
    std::vector<std::string> svcs;
    for (auto & device: devices)
    {
        json11::Json sb = read_osd_superblock(device);
        if (!sb.is_null())
        {
            svcs.push_back("vitastor-osd@"+sb["params"]["osd_num"].as_string());
        }
    }
    if (!svcs.size())
    {
        return 1;
    }
    // argv only borrows pointers from cmd and svcs which stay alive until exec
    std::vector<char*> argv;
    argv.reserve(cmd.size()+svcs.size()+2);
    argv.push_back((char*)"systemctl");
    for (auto & s: cmd)
    {
        argv.push_back((char*)s.c_str());
    }
    for (auto & s: svcs)
    {
        argv.push_back((char*)s.c_str());
    }
    argv.push_back(NULL);
    execvpe("systemctl", argv.data(), environ);
    // exec*() only returns when it fails
    fprintf(stderr, "Failed to execute systemctl: %s\n", strerror(errno));
    return 1;
}
// Replaces the current process with the OSD binary ("vitastor-osd" or the value of
// the "osd-binary" option), passing every superblock parameter of <device>
// as a pair of arguments: --<name> <value>.
// Only returns on failure: 1 if there is no valid superblock or the binary can't be executed.
int disk_tool_t::exec_osd(std::string device)
{
    json11::Json sb = read_osd_superblock(device);
    if (sb.is_null())
    {
        return 1;
    }
    std::string osd_binary = "vitastor-osd";
    if (options["osd-binary"] != "")
    {
        osd_binary = options["osd-binary"];
    }
    std::vector<std::string> argstr;
    argstr.push_back(osd_binary);
    for (auto & kv: sb["params"].object_items())
    {
        argstr.push_back("--"+kv.first);
        // Strings are passed as is, other values in their JSON representation
        argstr.push_back(kv.second.is_string() ? kv.second.string_value() : kv.second.dump());
    }
    // argv only borrows pointers from argstr which stays alive until exec.
    // A vector is used instead of a variable-length array which is not standard C++
    std::vector<char*> argv;
    argv.reserve(argstr.size()+1);
    for (auto & arg: argstr)
    {
        argv.push_back((char*)arg.c_str());
    }
    argv.push_back(NULL);
    execvpe(osd_binary.c_str(), argv.data(), environ);
    // exec*() only returns when it fails
    fprintf(stderr, "Failed to execute %s: %s\n", osd_binary.c_str(), strerror(errno));
    return 1;
}
// Makes sure that the write cache of <dev> is not in write-back mode.
// For SCSI/SATA disks (/sys/block/<parent>/device/scsi_disk/<id>/cache_type exists)
// a "write back" cache is switched to "write through". For other devices
// the state is only checked through check_queue_cache().
// returns 1 = warning, -1 = error, 0 = success
static int disable_cache(std::string dev)
{
    auto parent_dev = get_parent_device(dev);
    if (parent_dev == "")
        return 1;
    auto scsi_disk = "/sys/block/"+parent_dev+"/device/scsi_disk";
    DIR *dir = opendir(scsi_disk.c_str());
    if (!dir)
    {
        if (errno == ENOENT)
        {
            // Not a SCSI/SATA device, just check /sys/block/.../queue/write_cache
            return check_queue_cache(dev, parent_dev);
        }
        else
        {
            fprintf(stderr, "Can't read directory %s: %s\n", scsi_disk.c_str(), strerror(errno));
            return 1;
        }
    }
    else
    {
        // Skip "." and ".."
        dirent *de = readdir(dir);
        while (de && de->d_name[0] == '.' && (de->d_name[1] == 0 || (de->d_name[1] == '.' && de->d_name[2] == 0)))
            de = readdir(dir);
        if (!de)
        {
            // Not a SCSI/SATA device, just check /sys/block/.../queue/write_cache
            closedir(dir);
            return check_queue_cache(dev, parent_dev);
        }
        scsi_disk += "/";
        scsi_disk += de->d_name;
        if (readdir(dir) != NULL)
        {
            // Error, multiple scsi_disk/* entries
            closedir(dir);
            fprintf(stderr, "Multiple entries in %s found\n", scsi_disk.c_str());
            return 1;
        }
        closedir(dir);
        // Check cache_type
        scsi_disk += "/cache_type";
        std::string cache_type = read_file(scsi_disk);
        // sysfs attributes end with a newline: strip trailing whitespace,
        // otherwise the comparison below never matches
        cache_type.erase(cache_type.find_last_not_of(" \t\r\n")+1);
        if (cache_type == "")
            return 1;
        if (cache_type == "write back")
        {
            int fd = open(scsi_disk.c_str(), O_WRONLY);
            if (fd < 0 || write_blocking(fd, (void*)"write through", strlen("write through")) != strlen("write through"))
            {
                if (fd >= 0)
                    close(fd);
                fprintf(stderr, "Can't write to %s: %s\n", scsi_disk.c_str(), strerror(errno));
                return -1;
            }
            close(fd);
        }
    }
    return 0;
}
// Runs disable_cache() for <dev> and reports the outcome to stderr.
// Returns 1 only if the cache is known to be unsafe (disable_cache() returned -1);
// a failed check (1) only produces a warning and, like success, returns 0.
static int check_disabled_cache(std::string dev)
{
    switch (disable_cache(dev))
    {
    case 1:
        fprintf(
            stderr, "Warning: fsync is disabled for %s, but cache status check failed."
            " Ensure that cache is in write-through mode yourself or you may lose data.\n", dev.c_str()
        );
        return 0;
    case -1:
        fprintf(
            stderr, "Error: fsync is disabled for %s, but its cache is in write-back mode"
            " and we failed to make it write-through. Data loss is presumably possible."
            " Either switch the cache to write-through mode yourself or disable the check"
            " using skip_cache_check=1 in the superblock.\n", dev.c_str()
        );
        return 1;
    default:
        return 0;
    }
}
int disk_tool_t::pre_exec_osd(std::string device)
{
json11::Json sb = read_osd_superblock(device);
if (sb.is_null())
{
return 1;
}
if (!sb["params"]["skip_cache_check"].uint64_value())
{
if (json_is_true(sb["params"]["disable_data_fsync"]) &&
check_disabled_cache(sb["real_data_device"].string_value()) != 0)
{
return 1;
}
if (json_is_true(sb["params"]["disable_meta_fsync"]) &&
sb["real_meta_device"].string_value() != "" && sb["real_meta_device"] != sb["real_data_device"] &&
check_disabled_cache(sb["real_meta_device"].string_value()) != 0)
{
return 1;
}
if (json_is_true(sb["params"]["disable_journal_fsync"]) &&
sb["real_journal_device"].string_value() != "" && sb["real_journal_device"] != sb["real_meta_device"] &&
check_disabled_cache(sb["real_journal_device"].string_value()) != 0)
{
return 1;
}
}
return 0;
}

243
src/disk_tool_utils.cpp Normal file
View File

@ -0,0 +1,243 @@
// Copyright (c) Vitaliy Filippov, 2019+
// License: VNPL-1.1 (see README.md for details)
#include <sys/wait.h>
#include "disk_tool.h"
#include "rw_blocking.h"
// Resolves <path> with realpath(). On failure prints an error to stderr and
// returns the original path if <nofail> is true, or an empty string otherwise.
std::string realpath_str(std::string path, bool nofail)
{
    char *resolved = realpath((char*)path.c_str(), NULL);
    if (resolved)
    {
        // realpath() allocated the result, copy it and release the buffer
        std::string result(resolved);
        free(resolved);
        return result;
    }
    fprintf(stderr, "Failed to resolve %s: %s\n", path.c_str(), strerror(errno));
    if (nofail)
        return path;
    return "";
}
// Reads everything from <fd> until EOF or an error other than EAGAIN/EINTR
// and returns the data read so far.
std::string read_all_fd(int fd)
{
    std::string collected;
    char chunk[1024];
    for (;;)
    {
        const int got = read(fd, chunk, sizeof(chunk));
        if (got > 0)
        {
            collected.append(chunk, got);
            continue;
        }
        // EOF or a non-retriable error
        if (got == 0 || (errno != EAGAIN && errno != EINTR))
            break;
    }
    return collected;
}
// Returns the whole content of <file>.
// If the file can't be opened or nothing is read from it, prints an error
// to stderr and returns an empty string.
std::string read_file(std::string file)
{
    const int fd = open(file.c_str(), O_RDONLY);
    std::string content;
    if (fd >= 0)
        content = read_all_fd(fd);
    if (fd < 0 || content == "")
    {
        if (fd >= 0)
            close(fd);
        fprintf(stderr, "Can't read %s: %s\n", file.c_str(), strerror(errno));
        return "";
    }
    close(fd);
    return content;
}
// Checks the write cache mode reported in /sys/block/<dev>/queue/write_cache,
// falling back to the same attribute of <parent_dev>.
// Returns 0 if the cache is in write-through mode, -1 if it is in any other mode
// and 1 if the mode can't be determined.
int check_queue_cache(std::string dev, std::string parent_dev)
{
    auto r = read_file("/sys/block/"+dev+"/queue/write_cache");
    if (r == "")
        r = read_file("/sys/block/"+parent_dev+"/queue/write_cache");
    // sysfs attributes end with a newline: strip trailing whitespace,
    // otherwise the comparison below never matches
    r.erase(r.find_last_not_of(" \t\r\n")+1);
    if (r == "")
        return 1;
    return r == "write through" ? 0 : -1;
}
// Derives the name of the whole-disk device from a partition path:
// /dev/sda1 -> sda, /dev/nvme0n1p1 -> nvme0n1. The /dev/ prefix is removed.
// dm-N devices, and devices for which the derived name does not exist
// in /sys/block, are returned as is (without /dev/).
// Returns an empty string (with an error on stderr) if <dev> is outside /dev/
// or /sys/block can't be checked.
std::string get_parent_device(std::string dev)
{
    if (dev.compare(0, 5, "/dev/") != 0)
    {
        fprintf(stderr, "%s is outside /dev/\n", dev.c_str());
        return "";
    }
    const std::string name = dev.substr(5);
    // Strip the trailing partition number
    int end = name.size();
    while (end > 0 && isdigit(name[end-1]))
        end--;
    if (end >= 1 && name[end-1] == '-')
    {
        // dm-0, dm-1
        return name;
    }
    if (end >= 2 && name[end-1] == 'p' && isdigit(name[end-2]))
    {
        // nvme0n1p1
        end--;
    }
    // Check that such block device exists
    const std::string base = name.substr(0, end);
    const std::string sysfs_path = "/sys/block/"+base;
    struct stat st;
    if (stat(sysfs_path.c_str(), &st) >= 0)
        return base;
    if (errno == ENOENT)
        return name;
    fprintf(stderr, "Failed to stat %s: %s\n", sysfs_path.c_str(), strerror(errno));
    return "";
}
// Interprets a JSON value as a boolean flag: strings "true", "yes" and "1"
// are true, other strings are false, non-string values use bool_value().
bool json_is_true(const json11::Json & val)
{
    if (!val.is_string())
        return val.bool_value();
    const std::string & text = val.string_value();
    return text == "true" || text == "yes" || text == "1";
}
// Runs a command (cmd[0] is looked up in PATH, the rest are its arguments)
// without a shell, feeds <in> to its stdin and collects its output.
//   out - if not NULL, receives stdout of the command
//   err - if not NULL, receives stderr of the command; if NULL, the command
//         inherits stderr of the current process
// Returns the exit status of the command, or 255 if it can't be started,
// the command list is empty or the command is terminated by a signal.
int shell_exec(const std::vector<std::string> & cmd, const std::string & in, std::string *out, std::string *err)
{
    int child_stdin[2], child_stdout[2], child_stderr[2];
    pid_t pid;
    if (cmd.empty())
        return 255;
    if (pipe(child_stdin) == -1)
        goto err_pipe1;
    if (pipe(child_stdout) == -1)
        goto err_pipe2;
    if (pipe(child_stderr) == -1)
        goto err_pipe3;
    if ((pid = fork()) == -1)
        goto err_fork;
    if (pid)
    {
        // Parent
        // We should do select() to do something serious, but this is for simple cases
        close(child_stdin[0]);
        close(child_stdout[1]);
        close(child_stderr[1]);
        write_blocking(child_stdin[1], (void*)in.data(), in.size());
        close(child_stdin[1]);
        std::string s;
        s = read_all_fd(child_stdout[0]);
        if (out)
            out->swap(s);
        close(child_stdout[0]);
        s = read_all_fd(child_stderr[0]);
        if (err)
            err->swap(s);
        close(child_stderr[0]);
        int wstatus = 0;
        waitpid(pid, &wstatus, 0);
        // The exit status is only meaningful for a normally exited child
        return WIFEXITED(wstatus) ? WEXITSTATUS(wstatus) : 255;
    }
    else
    {
        // Child
        dup2(child_stdin[0], 0);
        dup2(child_stdout[1], 1);
        if (err)
            dup2(child_stderr[1], 2);
        close(child_stdin[0]);
        close(child_stdin[1]);
        close(child_stdout[0]);
        close(child_stdout[1]);
        close(child_stderr[0]);
        close(child_stderr[1]);
        //char *argv[] = { (char*)"/bin/sh", (char*)"-c", (char*)cmd.c_str(), NULL };
        // argv needs one extra slot for the terminating NULL which must come
        // after the last argument and not replace it
        std::vector<char*> argv;
        argv.reserve(cmd.size()+1);
        for (const auto & arg: cmd)
        {
            argv.push_back((char*)arg.c_str());
        }
        argv.push_back(NULL);
        execvp(argv[0], argv.data());
        // execvp() only returns when it fails
        std::string full_cmd;
        for (size_t i = 0; i < cmd.size(); i++)
        {
            full_cmd += cmd[i];
            full_cmd += " ";
        }
        full_cmd.resize(full_cmd.size() > 0 ? full_cmd.size()-1 : 0);
        fprintf(stderr, "error running %s: %s", full_cmd.c_str(), strerror(errno));
        exit(255);
    }
err_fork:
    close(child_stderr[1]);
    close(child_stderr[0]);
err_pipe3:
    close(child_stdout[1]);
    close(child_stdout[0]);
err_pipe2:
    close(child_stdin[1]);
    close(child_stdin[0]);
err_pipe1:
    return 255;
}
// Fills <size> bytes of <fd> starting at <offset> with zeroes, writing in chunks of up to 1 MiB.
// Returns 0 on success, -1 on a write error (errno is set by pwrite()).
int write_zero(int fd, uint64_t offset, uint64_t size)
{
    uint64_t buf_len = 1024*1024;
    void *zero_buf = memalign_or_die(MEM_ALIGNMENT, buf_len);
    // The allocation is not guaranteed to be zeroed by the allocator
    memset(zero_buf, 0, buf_len);
    ssize_t r;
    while (size > 0)
    {
        r = pwrite(fd, zero_buf, size > buf_len ? buf_len : size, offset);
        if (r > 0)
        {
            size -= r;
            offset += r;
        }
        // errno is only meaningful when pwrite() fails. A zero-length write
        // makes no progress and is treated as an error to avoid looping forever
        else if (r == 0 || (errno != EAGAIN && errno != EINTR))
        {
            free(zero_buf);
            return -1;
        }
    }
    free(zero_buf);
    return 0;
}
// Reads the partition table of <dev> using `sfdisk --dump --json`.
// Returns the "partitiontable" object of sfdisk output,
// false in case of an error (sfdisk can't be run, bad JSON or a non-GPT table)
// and null if there is no partition table (sfdisk prints nothing).
json11::Json read_parttable(std::string dev)
{
    std::string dump_out;
    if (shell_exec({ "/sbin/sfdisk", "--dump", dev, "--json" }, "", &dump_out, NULL) == 255)
    {
        fprintf(stderr, "Error running /sbin/sfdisk --dump %s --json\n", dev.c_str());
        return json11::Json(false);
    }
    if (dump_out == "")
    {
        // No output = no partition table
        return json11::Json();
    }
    // Decode partition table
    std::string parse_err;
    const json11::Json parsed = json11::Json::parse(dump_out, parse_err);
    if (parse_err != "")
    {
        fprintf(stderr, "sfdisk --dump %s --json returned bad JSON: %s\n", dev.c_str(), dump_out.c_str());
        return json11::Json(false);
    }
    json11::Json table = parsed["partitiontable"];
    const bool is_gpt = table["label"].string_value() == "gpt";
    if (table.is_object() && !is_gpt)
    {
        fprintf(stderr, "%s contains \"%s\" partition table, only GPT is supported, skipping\n", dev.c_str(), table["label"].string_value().c_str());
        return json11::Json(false);
    }
    return table;
}
// Calculates unallocated space of a partition table <pt> in bytes:
// (lastlba + 1 - firstlba - sum of partition sizes) * sectorsize.
uint64_t free_from_parttable(json11::Json pt)
{
    uint64_t used_sectors = 0;
    for (const auto & part: pt["partitions"].array_items())
    {
        used_sectors += part["size"].uint64_value();
    }
    const uint64_t usable_sectors = pt["lastlba"].uint64_value() + 1 - pt["firstlba"].uint64_value();
    return (usable_sectors - used_sectors) * pt["sectorsize"].uint64_value();
}