Browse Source

WIP ceph-disk prepare

Vitaliy Filippov 3 days ago
parent
commit
1d09443c92
  1. 2
      mon/make-osd-hybrid.js
  2. 251
      src/disk_tool.cpp

2
mon/make-osd-hybrid.js

@ -104,7 +104,7 @@ async function collect_devices(devices_to_check)
if (has_partition_table != 0)
{
// Check if the device has any data
const [ has_data, out ] = await system(`blkid ${dev}`);
const [ has_data, out ] = await system(`blkid -p ${dev}`);
if (has_data == 0)
{
console.log(`${dev} contains data, skipping:\n ${out.trim().replace(/\n/g, '\n ')}`);

251
src/disk_tool.cpp

@ -3,6 +3,7 @@
#define _LARGEFILE64_SOURCE
#include <sys/types.h>
#include <sys/wait.h>
#include <dirent.h>
#include <sys/ioctl.h>
#include <sys/stat.h>
@ -56,6 +57,37 @@ static const char *help_text =
"\n"
"COMMANDS:\n"
"\n"
"vitastor-disk prepare [OPTIONS] [devices...]\n"
" Initialize disk(s) for Vitastor OSD(s).\n"
" There are two forms of this command. In the first form, you pass <devices> which\n"
" must be raw disks (not partitions). They are partitioned automatically and OSDs\n"
" are initialized on all of them.\n"
" In the second form, you omit <devices> and pass --data_device, --journal_device\n"
" and/or --meta_device which must be already existing partitions. In this case\n"
" a single OSD is created.\n"
" OPTIONS may include:\n"
" --hybrid\n"
" Prepare hybrid (HDD+SSD) OSDs using provided devices. SSDs will be used for\n"
" journals and metadata, HDDs will be used for data. Partitions for journals and\n"
" metadata will be created automatically. SSD/HDD are found by the `rotational`\n"
" flag of devices. In hybrid mode, object size is 1 MB instead of 128 KB by\n"
" default, and journal size is 1 GB instead of 32 MB by default.\n"
" --data_device <DEV> Create a single OSD using partition <DEV> for data\n"
" --meta_device <DEV> Create a single OSD using partition <DEV> for metadata\n"
" --journal_device <DEV> Create a single OSD using partition <DEV> for journal\n"
" --journal_size 1G/32M Set journal size\n"
" --object_size 1M/128k Set blockstore object size\n"
" --disable_ssd_cache 1 Disable cache and fsyncs for SSD journal and metadata\n"
" --disable_hdd_cache 1 Disable cache and fsyncs for HDD data\n"
" --bitmap_granularity 4k Set bitmap granularity\n"
" --data_device_block 4k Override data device block size\n"
" --meta_device_block 4k Override metadata device block size\n"
" --journal_device_block 4k Override journal device block size\n"
" --meta_reserve 2x,1G\n"
" New metadata partitions in --hybrid mode are created larger than actual\n"
" metadata size to ease possible future extension. The default is to allocate\n"
" 2 times more space and at least 1G. Use this option to override.\n"
"\n"
"vitastor-disk resize <ALL_OSD_PARAMETERS> <NEW_LAYOUT> [--iodepth 32]\n"
" Resize data area and/or rewrite/move journal and metadata\n"
" ALL_OSD_PARAMETERS must include all (at least all disk-related)\n"
@ -193,8 +225,10 @@ struct disk_tool_t
int systemd_start_stop_osds(std::vector<std::string> cmd, std::vector<std::string> devices);
int pre_exec_osd(std::string device);
json11::Json read_osd_superblock(std::string device);
json11::Json read_osd_superblock(std::string device, bool expect_exist = true);
uint32_t write_osd_superblock(std::string device, json11::Json params);
int prepare(std::vector<std::string> devices);
};
void disk_tool_simple_offsets(json11::Json cfg, bool json_output);
@ -221,6 +255,10 @@ int main(int argc, char *argv[])
{
self.json = true;
}
else if (!strcmp(argv[i], "--hybrid"))
{
self.options["hybrid"] = "1";
}
else if (!strcmp(argv[i], "--help"))
{
cmd.insert(cmd.begin(), (char*)"help");
@ -1278,13 +1316,13 @@ static std::string udev_escape(std::string str)
return r;
}
static std::string realpath_str(std::string path)
static std::string realpath_str(std::string path, bool nofail = true)
{
char *p = realpath((char*)path.c_str(), NULL);
if (!p)
{
fprintf(stderr, "Failed to resolve %s: %s\n", path.c_str(), strerror(errno));
return path;
return nofail ? path : "";
}
std::string rp(p);
free(p);
@ -1380,7 +1418,7 @@ uint32_t disk_tool_t::write_osd_superblock(std::string device, json11::Json para
return sb_size;
}
json11::Json disk_tool_t::read_osd_superblock(std::string device)
json11::Json disk_tool_t::read_osd_superblock(std::string device, bool expect_exist)
{
vitastor_disk_superblock_t *sb = NULL;
uint8_t *buf = NULL;
@ -1403,14 +1441,16 @@ json11::Json disk_tool_t::read_osd_superblock(std::string device)
sb = (vitastor_disk_superblock_t*)buf;
if (sb->magic != VITASTOR_DISK_MAGIC)
{
fprintf(stderr, "Invalid OSD superblock on %s: magic number mismatch\n", device.c_str());
if (expect_exist)
fprintf(stderr, "Invalid OSD superblock on %s: magic number mismatch\n", device.c_str());
goto ex;
}
if (sb->size > VITASTOR_DISK_MAX_SB_SIZE ||
// +2 is minimal json: {}
sb->size < sizeof(vitastor_disk_superblock_t)+2)
{
fprintf(stderr, "Invalid OSD superblock on %s: invalid size\n", device.c_str());
if (expect_exist)
fprintf(stderr, "Invalid OSD superblock on %s: invalid size\n", device.c_str());
goto ex;
}
if (sb->size > 4096)
@ -1429,26 +1469,30 @@ json11::Json disk_tool_t::read_osd_superblock(std::string device)
}
if (sb->crc32c != crc32c(0, &sb->size, sb->size - ((uint8_t*)&sb->size - buf)))
{
fprintf(stderr, "Invalid OSD superblock on %s: crc32 mismatch\n", device.c_str());
if (expect_exist)
fprintf(stderr, "Invalid OSD superblock on %s: crc32 mismatch\n", device.c_str());
goto ex;
}
osd_params = json11::Json::parse(std::string((char*)sb->json_data, sb->size - sizeof(vitastor_disk_superblock_t)), json_err);
if (json_err != "")
{
fprintf(stderr, "Invalid OSD superblock on %s: invalid JSON\n", device.c_str());
if (expect_exist)
fprintf(stderr, "Invalid OSD superblock on %s: invalid JSON\n", device.c_str());
goto ex;
}
// Validate superblock
if (!osd_params["osd_num"].uint64_value())
{
fprintf(stderr, "OSD superblock on %s lacks osd_num\n", device.c_str());
osd_params = json11::Json::object{};
if (expect_exist)
fprintf(stderr, "OSD superblock on %s lacks osd_num\n", device.c_str());
osd_params = json11::Json();
goto ex;
}
if (osd_params["data_device"].string_value() == "")
{
fprintf(stderr, "OSD superblock on %s lacks data_device\n", device.c_str());
osd_params = json11::Json::object{};
if (expect_exist)
fprintf(stderr, "OSD superblock on %s lacks data_device\n", device.c_str());
osd_params = json11::Json();
goto ex;
}
real_device = realpath_str(device);
@ -1479,8 +1523,9 @@ json11::Json disk_tool_t::read_osd_superblock(std::string device)
}
else
{
fprintf(stderr, "Invalid OSD superblock on %s: does not refer to the device itself\n", device.c_str());
osd_params = json11::Json::object{};
if (expect_exist)
fprintf(stderr, "Invalid OSD superblock on %s: does not refer to the device itself\n", device.c_str());
osd_params = json11::Json();
goto ex;
}
osd_params = json11::Json::object{
@ -1560,11 +1605,28 @@ int disk_tool_t::exec_osd(std::string device)
return 0;
}
static std::string read_all_fd(int fd)
{
int res_size = 0;
std::string res;
while (1)
{
res.resize(res_size+1024);
int r = read(fd, res.data()+res_size, res.size()-res_size);
if (r > 0)
res_size += r;
else if (!r || errno != EAGAIN && errno != EINTR)
break;
}
res.resize(res_size);
return res;
}
static std::string read_file(std::string file)
{
char buf[64];
int fd = open(file.c_str(), O_RDONLY), r;
if (fd < 0 || (r = read(fd, buf, sizeof(buf))) < 0)
std::string res;
int fd = open(file.c_str(), O_RDONLY);
if (fd < 0 || (res = read_all_fd(fd)) == "")
{
if (fd >= 0)
close(fd);
@ -1572,7 +1634,7 @@ static std::string read_file(std::string file)
return "";
}
close(fd);
return std::string(buf, r);
return res;
}
static int check_queue_cache(std::string dev, std::string parent_dev)
@ -1585,22 +1647,42 @@ static int check_queue_cache(std::string dev, std::string parent_dev)
return r == "write through" ? 0 : -1;
}
// returns 1 = warning, -1 = error, 0 = success
static int disable_cache(std::string dev)
static std::string get_parent_device(std::string dev)
{
if (dev.substr(0, 5) != "/dev/")
{
fprintf(stderr, "%s is outside /dev/\n", dev.c_str());
return 1;
return "";
}
dev = dev.substr(5);
int i = dev.size();
while (i > 0 && isdigit(dev[i-1]))
i--;
// Check if it's a partition
if (i >= 2 && dev[i-1] == 'p' && isdigit(dev[i-2]))
if (i >= 1 && dev[i-1] == '-') // dm-0, dm-1
return dev;
else if (i >= 2 && dev[i-1] == 'p' && isdigit(dev[i-2])) // nvme0n1p1
i--;
auto parent_dev = dev.substr(0, i);
// Check that such block device exists
struct stat st;
auto chk = "/sys/block/"+dev.substr(0, i);
if (stat(chk.c_str(), &st) < 0)
{
if (errno != ENOENT)
{
fprintf(stderr, "Failed to stat %s: %s\n", chk.c_str(), strerror(errno));
return "";
}
return dev;
}
return dev.substr(0, i);
}
// returns 1 = warning, -1 = error, 0 = success
static int disable_cache(std::string dev)
{
auto parent_dev = get_parent_device(dev);
if (parent_dev == "")
return 1;
auto scsi_disk = "/sys/block/"+parent_dev+"/device/scsi_disk";
DIR *dir = opendir(scsi_disk.c_str());
if (!dir)
@ -1717,3 +1799,124 @@ int disk_tool_t::pre_exec_osd(std::string device)
}
return 0;
}
static int shell_exec(const std::vector<std::string> & cmd, const std::string & in, std::string *out, std::string *err)
{
int child_stdin[2], child_stdout[2], child_stderr[2];
pid_t pid;
if (pipe(child_stdin) == -1)
goto err_pipe1;
if (pipe(child_stdout) == -1)
goto err_pipe2;
if (pipe(child_stderr) == -1)
goto err_pipe3;
if ((pid = fork()) == -1)
goto err_fork;
if (pid)
{
// Parent
// We should do select() to do something serious, but this is for simple cases
close(child_stdin[0]);
close(child_stdout[1]);
close(child_stderr[1]);
write_blocking(child_stdin[1], (void*)in.data(), in.size());
close(child_stdin[1]);
std::string s;
s = read_all_fd(child_stdout[0]);
if (out)
out->swap(s);
close(child_stdout[0]);
s = read_all_fd(child_stderr[0]);
if (err)
err->swap(s);
close(child_stderr[0]);
int wstatus = 0;
waitpid(pid, &wstatus, 0);
return WEXITSTATUS(wstatus);
}
else
{
// Child
dup2(child_stdin[0], 0);
dup2(child_stdout[1], 1);
close(child_stdin[0]);
close(child_stdin[1]);
close(child_stdout[0]);
close(child_stdout[1]);
//char *argv[] = { (char*)"/bin/sh", (char*)"-c", (char*)cmd.c_str(), NULL };
char *argv[cmd.size()+1];
for (int i = 0; i < cmd.size(); i++)
{
argv[i] = (char*)cmd[i].c_str();
}
argv[cmd.size()-1] = NULL;
execv(argv[0], argv);
std::string full_cmd;
for (int i = 0; i < cmd.size(); i++)
{
full_cmd += cmd[i];
full_cmd += " ";
}
full_cmd.resize(full_cmd.size() > 0 ? full_cmd.size()-1 : 0);
fprintf(stderr, "error running %s: %s", full_cmd.c_str(), strerror(errno));
exit(1);
}
err_fork:
close(child_stderr[1]);
close(child_stderr[0]);
err_pipe3:
close(child_stdout[1]);
close(child_stdout[0]);
err_pipe2:
close(child_stdin[1]);
close(child_stdin[0]);
err_pipe1:
return 1;
}
int disk_tool_t::prepare(std::vector<std::string> devices)
{
if (options.find("data_device") != options.end() && options["data_device"] != "")
{
if (options.find("hybrid") != options.end() || devices.size())
{
fprintf(stderr, "Device list (positional arguments) and --hybrid are incompatible with --data_device\n");
return 1;
}
if (options.find("force") == options.end())
{
for (auto dev: std::vector<std::string>{ options["data_device"], options["meta_device"], options["journal_device"] })
{
if (dev == "")
continue;
std::string real_dev = realpath_str(dev, false);
if (real_dev == "")
return 1;
std::string parent_dev = get_parent_device(real_dev);
if (parent_dev == "")
return 1;
if (parent_dev == real_dev)
{
fprintf(stderr, "%s is not a partition, not creating OSD without --force\n", dev.c_str());
return 1;
}
std::string out;
if (shell_exec({ "/sbin/blkid", "-p", dev }, "", &out, NULL))
{
fprintf(stderr, "%s contains data, not creating OSD without --force. blkid -p says:\n%s", dev.c_str(), out.c_str());
return 1;
}
json11::Json sb = read_osd_superblock(dev, false);
if (!sb.is_null())
{
fprintf(stderr, "%s already contains Vitastor OSD superblock, not creating OSD without --force\n", dev.c_str());
return 1;
}
}
}
// Calculate offsets if the same device is used for two or more of data, meta, and journal
//write_osd_superblock(dev, );
}
return 0;
}

Loading…
Cancel
Save