Compare commits

...

2 Commits

  1. 2
      mon/90-vitastor.rules
  2. 65
      mon/upgrade-simple.js
  3. 9
      src/disk_tool.cpp
  4. 2
      src/disk_tool.h
  5. 10
      src/disk_tool_udev.cpp
  6. 11
      src/disk_tool_utils.cpp
  7. 2
      src/lrc/Makefile
  8. 291
      src/lrc/mat.c

2
mon/90-vitastor.rules

@ -2,3 +2,5 @@ SUBSYSTEM=="block", ENV{ID_PART_ENTRY_TYPE}=="e7009fac-a5a1-4d72-af72-53de130599
OWNER="vitastor", GROUP="vitastor", \
IMPORT{program}="/usr/bin/vitastor-disk udev $devnode", \
SYMLINK+="vitastor/$env{VITASTOR_ALIAS}"
ENV{VITASTOR_OSD_NUM}!="", RUN{program}+="/usr/bin/systemctl enable --now vitastor-osd@$env{VITASTOR_OSD_NUM}"

65
mon/upgrade-simple.js

@ -2,6 +2,7 @@
// Upgrade tool for OSD units generated with make-osd.sh and make-osd-hybrid.js
const fsp = require('fs').promises;
const child_process = require('child_process');
upgrade_osd(process.argv[2]).catch(e =>
{
@ -19,10 +20,10 @@ async function upgrade_osd(unit)
service_name = service_name[1];
// Parse the unit
const text = await fsp.readFile(unit, { encoding: 'utf-8' });
let cmd = /\nExecStart\s*=[^\n]+vitastor-osd\s*(([^\n&>\d]+|\\[ \t\r]*\n|\d[^>])+)/.exec(text);
let cmd = /\nExecStart\s*=[^\n]+vitastor-osd\s*(([^\\\n&>\d]+|\\[ \t\r]*\n|\d[^>])+)/.exec(text);
if (!cmd)
throw new Error('Failed to extract ExecStart command from '+unit);
cmd = cmd[1].replace(/\\[ \t\r]*\n/g, '');
cmd = cmd[1].replace(/\\[ \t\r]*\n/g, '').split(/\s+/);
const options = {};
for (let i = 0; i < cmd.length-1; i += 2)
{
@ -43,7 +44,7 @@ async function upgrade_osd(unit)
);
}
// Stop and disable the service
system_or_die("systemctl disable --now "+service_name);
await system_or_die("systemctl disable --now "+service_name);
const j_o = BigInt(options['journal_offset'] || 0);
const m_o = BigInt(options['meta_offset'] || 0);
const d_o = BigInt(options['data_offset'] || 0);
@ -72,36 +73,35 @@ async function upgrade_osd(unit)
if (!j_is_d && !j_is_m && j_o < 4096)
resize.new_journal_offset = j_o+4096n;
const resize_opts = Object.keys(resize).map(k => ` --${k} ${resize[k]}`).join('');
console.log('Resize options:'+resize_opts);
await system_or_die(
'vitastor-disk resize'+
Object.keys(options).map(k => ` --${k} ${options[k]}`).join('')+resize_opts
);
const resize_cmd = 'vitastor-disk resize'+
Object.keys(options).map(k => ` --${k} ${options[k]}`).join('')+resize_opts;
await system_or_die(resize_cmd, { no_cmd_on_err: true });
for (let k in resize)
options[k.substr(4)] = resize[k];
options[k.substr(4)] = ''+resize[k];
}
// Write superblock
const sb = JSON.stringify(options);
await system_or_die('vitastor-disk write-sb '+options['data_device'], sb);
await system_or_die('vitastor-disk write-sb '+options['data_device'], { input: sb });
if (!m_is_d)
await system_or_die('vitastor-disk write-sb '+options['meta_device'], sb);
await system_or_die('vitastor-disk write-sb '+options['meta_device'], { input: sb });
if (!j_is_d && !j_is_m)
await system_or_die('vitastor-disk write-sb '+options['journal_device'], sb);
await system_or_die('vitastor-disk write-sb '+options['journal_device'], { input: sb });
// Change partition type
fix_partition_type(options['data_device']);
await fix_partition_type(options['data_device']);
if (!m_is_d)
fix_partition_type(options['meta_device']);
await fix_partition_type(options['meta_device']);
if (!j_is_d && !j_is_m)
fix_partition_type(options['journal_device']);
await fix_partition_type(options['journal_device']);
// Enable the new unit
system_or_die("systemctl enable --now vitastor-osd@"+options['osd_num']);
await system_or_die("systemctl enable --now vitastor-osd@"+options['osd_num']);
console.log('\nOK: Converted OSD '+options['osd_num']+' to the new scheme. The new service name is vitastor-osd@'+options['osd_num']);
}
async function fix_partition_type(dev)
{
const uuid = dev.replace(/^.*\//, '').toLowerCase();
const parent_dev = (await fsp.realpath(dev)).replace(/((\d)p|(\D))?\d+$/, '$2$3');
const pt = JSON.parse(await system_or_die('sfdisk --dump '+parent_dev+' --json')).partitiontable;
const pt = JSON.parse(await system_or_die('sfdisk --dump '+parent_dev+' --json', { get_out: true })).partitiontable;
let script = 'label: gpt\n\n';
for (const part of pt.partitions)
{
@ -109,33 +109,34 @@ async function fix_partition_type(dev)
part.type = 'e7009fac-a5a1-4d72-af72-53de13059903';
script += part.node+': '+Object.keys(part).map(k => k == 'node' ? '' : k+'='+part[k]).filter(k => k).join(', ')+'\n';
}
await system_or_die('sfdisk '+dev.path, script);
await system_or_die('sfdisk --force '+parent_dev, { input: script, get_out: true });
}
async function system_or_die(cmd, input = '')
async function system_or_die(cmd, options = {})
{
let [ exitcode, stdout, stderr ] = await system(cmd, input);
let [ exitcode, stdout, stderr ] = await system(cmd, options);
if (exitcode != 0)
throw new Error(cmd+' failed: '+stderr);
throw new Error((!options.no_cmd_on_err ? cmd : 'Command')+' failed'+(options.get_err ? ': '+stderr : ''));
return stdout;
}
async function system(cmd, input = '')
async function system(cmd, options = {})
{
if (options.debug)
{
process.stderr.write('+ '+cmd+(input ? " <<EOF\n"+input.replace(/\s*$/, '\n')+"EOF" : '')+'\n');
}
const cp = child_process.spawn(cmd, { shell: true });
process.stderr.write('Running: '+cmd+(options.input != null ? " <<EOF\n"+options.input.replace(/\s*$/, '\n')+"EOF" : '')+'\n');
const cp = child_process.spawn(cmd, {
shell: true,
stdio: [ 'pipe', options.get_out ? 'pipe' : 1, options.get_err ? 'pipe' : 1 ],
});
let stdout = '', stderr = '', finish_cb;
cp.stdout.on('data', buf => stdout += buf.toString());
cp.stderr.on('data', buf => stderr += buf.toString());
if (options.get_out)
cp.stdout.on('data', buf => stdout += buf.toString());
if (options.get_err)
cp.stderr.on('data', buf => stderr += buf.toString());
cp.on('exit', () => finish_cb && finish_cb());
cp.stdin.write(input);
if (options.input != null)
cp.stdin.write(options.input);
cp.stdin.end();
if (cp.exitCode == null)
{
await new Promise(ok => finish_cb = ok);
}
return [ cp.exitCode, stdout, stderr ];
}

9
src/disk_tool.cpp

@ -254,6 +254,15 @@ int main(int argc, char *argv[])
}
return self.exec_osd(cmd[1]);
}
else if (!strcmp(cmd[0], "pre-exec"))
{
if (cmd.size() != 2)
{
fprintf(stderr, "Exactly 1 device path argument is required\n");
return 1;
}
return self.pre_exec_osd(cmd[1]);
}
else
{
print_help(help_text, "vitastor-disk", cmd.size() > 1 ? cmd[1] : "", self.all);

2
src/disk_tool.h

@ -126,7 +126,7 @@ void disk_tool_simple_offsets(json11::Json cfg, bool json_output);
std::string realpath_str(std::string path, bool nofail = true);
std::string read_all_fd(int fd);
std::string read_file(std::string file);
std::string read_file(std::string file, bool allow_enoent = false);
int check_queue_cache(std::string dev, std::string parent_dev);
std::string get_parent_device(std::string dev);
bool json_is_true(const json11::Json & val);

10
src/disk_tool_udev.cpp

@ -43,7 +43,7 @@ int disk_tool_t::udev_import(std::string device)
uint64_t osd_num = sb["params"]["osd_num"].uint64_value();
// Print variables for udev
printf("VITASTOR_OSD_NUM=%lu\n", osd_num);
printf("VITASTOR_ALIAS=osd%lu%s\n", osd_num, sb["device_type"].string_value().c_str());
printf("VITASTOR_ALIAS=osd%lu-%s\n", osd_num, sb["device_type"].string_value().c_str());
printf("VITASTOR_DATA_DEVICE=%s\n", udev_escape(sb["params"]["data_device"].string_value()).c_str());
if (sb["real_meta_device"].string_value() != "" && sb["real_meta_device"] != sb["real_data_device"])
printf("VITASTOR_META_DEVICE=%s\n", udev_escape(sb["params"]["meta_device"].string_value()).c_str());
@ -201,9 +201,9 @@ json11::Json disk_tool_t::read_osd_superblock(std::string device, bool expect_ex
}
real_device = realpath_str(device);
real_data = realpath_str(osd_params["data_device"].string_value());
real_meta = osd_params["meta_device"] != "" && osd_params["meta_device"] != osd_params["data_device"]
real_meta = osd_params["meta_device"].string_value() != "" && osd_params["meta_device"] != osd_params["data_device"]
? realpath_str(osd_params["meta_device"].string_value()) : "";
real_journal = osd_params["journal_device"] != "" && osd_params["journal_device"] != osd_params["meta_device"]
real_journal = osd_params["journal_device"].string_value() != "" && osd_params["journal_device"] != osd_params["meta_device"]
? realpath_str(osd_params["journal_device"].string_value()) : "";
if (real_journal == real_meta)
{
@ -322,7 +322,7 @@ static int disable_cache(std::string dev)
if (errno == ENOENT)
{
// Not a SCSI/SATA device, just check /sys/block/.../queue/write_cache
return check_queue_cache(dev, parent_dev);
return check_queue_cache(dev.substr(5), parent_dev);
}
else
{
@ -339,7 +339,7 @@ static int disable_cache(std::string dev)
{
// Not a SCSI/SATA device, just check /sys/block/.../queue/write_cache
closedir(dir);
return check_queue_cache(dev, parent_dev);
return check_queue_cache(dev.substr(5), parent_dev);
}
scsi_disk += "/";
scsi_disk += de->d_name;

11
src/disk_tool_utils.cpp

@ -5,6 +5,7 @@
#include "disk_tool.h"
#include "rw_blocking.h"
#include "str_util.h"
std::string realpath_str(std::string path, bool nofail)
{
@ -36,15 +37,17 @@ std::string read_all_fd(int fd)
return res;
}
std::string read_file(std::string file)
std::string read_file(std::string file, bool allow_enoent)
{
std::string res;
int fd = open(file.c_str(), O_RDONLY);
if (fd < 0 || (res = read_all_fd(fd)) == "")
{
int err = errno;
if (fd >= 0)
close(fd);
fprintf(stderr, "Can't read %s: %s\n", file.c_str(), strerror(errno));
if (!allow_enoent || err != ENOENT)
fprintf(stderr, "Can't read %s: %s\n", file.c_str(), strerror(err));
return "";
}
close(fd);
@ -53,12 +56,12 @@ std::string read_file(std::string file)
int check_queue_cache(std::string dev, std::string parent_dev)
{
auto r = read_file("/sys/block/"+dev+"/queue/write_cache");
auto r = read_file("/sys/block/"+dev+"/queue/write_cache", true);
if (r == "")
r = read_file("/sys/block/"+parent_dev+"/queue/write_cache");
if (r == "")
return 1;
return r == "write through" ? 0 : -1;
return trim(r) == "write through" ? 0 : -1;
}
std::string get_parent_device(std::string dev)

2
src/lrc/Makefile

@ -0,0 +1,2 @@
mat: mat.c
gcc -O3 -I/usr/include/jerasure -o mat mat.c -lJerasure

291
src/lrc/mat.c

@ -0,0 +1,291 @@
#include <jerasure/reed_sol.h>
#include <jerasure.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>
// Generate LRC matrix: (groups*local + global) code rows with (data_drives) columns.
//
// Layout of the returned row-major matrix:
// - rows [0, groups*local): local parities. Row (gr*local + l) copies row l of the
//   Vandermonde coding matrix for the columns that belong to group gr and is zero elsewhere.
// - rows [groups*local, groups*local + global): global parities, copied verbatim from
//   rows local..local+global-1 of the Vandermonde coding matrix.
//
// w should be >= log2(data_drives + groups*local + global), but not necessarily 8/16/32.
//
// Returns a malloc()ed matrix which the caller must free(), or NULL if the parameters
// are invalid, the Vandermonde matrix can't be built or memory allocation fails.
//
// NOTE(review): columns are assigned to groups as j / (data_drives/groups), so when
// data_drives is not a multiple of groups the trailing columns belong to no group and
// receive no local parity - confirm whether that is intended.
int* reed_sol_vandermonde_lrc_matrix(int data_drives, int groups, int local, int global, int w)
{
    // Reject parameters that would divide by zero below (no groups, or fewer drives
    // than groups) or that request no code rows at all
    if (data_drives <= 0 || groups <= 0 || local < 0 || global < 0 ||
        data_drives < groups || (local == 0 && global == 0))
    {
        return NULL;
    }
    long long code_rows = (long long)groups*local + global;
    // Compare in 64 bits: (1 << w) overflows int for w >= 31
    if (w < 0 || w > 32 || data_drives + code_rows > (1LL << w))
    {
        return NULL;
    }
    int group_size = data_drives / groups;
    // The LRC matrix has code_rows rows of data_drives columns each
    int *lrc_matrix = (int*)malloc(sizeof(int) * (size_t)code_rows * (size_t)data_drives);
    int *matrix = reed_sol_vandermonde_coding_matrix(data_drives, local+global, w);
    if (!lrc_matrix || !matrix)
    {
        free(matrix);
        free(lrc_matrix);
        return NULL;
    }
    // Enough to transform LRC 8+2+2 GF(8) matrix into MR-LRC
    //for (int i = 0; i < local+global; i++)
    //{
    //    int t = matrix[i*data_drives + 3];
    //    matrix[i*data_drives + 3] = matrix[i*data_drives + 7];
    //    matrix[i*data_drives + 7] = t;
    //}
    // Local parity rows: keep Vandermonde coefficients only inside the row's own group
    for (int gr = 0; gr < groups; gr++)
    {
        for (int l = 0; l < local; l++)
        {
            for (int j = 0; j < data_drives; j++)
            {
                lrc_matrix[(gr*local+l)*data_drives + j] = (j / group_size) == gr ? matrix[l*data_drives + j] : 0;
            }
        }
    }
    // Global parity rows: the remaining Vandermonde rows cover all data columns
    for (int i = 0; i < global; i++)
    {
        for (int j = 0; j < data_drives; j++)
        {
            lrc_matrix[(groups*local+i)*data_drives + j] = matrix[(local+i)*data_drives + j];
        }
    }
    free(matrix);
    return lrc_matrix;
}
// Counters produced by check_mr_lrc() over all tested combinations of surviving chunks
struct lrc_test_result_t
{
    // Combinations expected to be recoverable whose matrix was inverted successfully
    int success;
    // Combinations classified as not recoverable by chunk counting
    int impossible;
    // Combinations expected to be recoverable whose matrix could not be inverted
    int failures;
};
// Check if the generated LRC with given parameters is Maximally Reconstructible (MR-LRC)
// Example of a MR-LRC: (8, 2, 1, 2, 6, 8)
//
// lrc_matrix is the full row-major matrix with (data_drives + groups*local + global) rows
// and data_drives columns: rows below data_drives are treated as data chunks, the next
// groups*local rows as local parity chunks and the rest as global parity chunks.
//
// For every number of lost chunks from global+2 to groups*local+global the function
// enumerates combinations of surviving rows, decides by counting whether the combination
// is expected to be recoverable, and if so tries to invert the selected n x n submatrix
// with jerasure_invert_matrix() in GF(2^w).
//
// log_level: > 0 prints failed combinations, > 1 also prints "impossible" ones,
// > 2 also prints successful ones.
//
// Returns the numbers of successful, impossible and failed combinations.
//
// NOTE(review): group membership is computed as row / (n/groups); if n is not a multiple
// of groups this index can reach `groups` and run past lost_per_group - verify callers
// only pass evenly divisible parameters.
struct lrc_test_result_t check_mr_lrc(int *lrc_matrix, int data_drives, int groups, int local, int global, int w, int log_level)
{
    int n = data_drives;
    int total_rows = n + groups*local + global;
    int impossible = 0, success = 0, failures = 0;
    int *lost_per_group = (int*)malloc(sizeof(int) * groups);
    int *recovered_per_group = (int*)malloc(sizeof(int) * groups);
    int *selected_inverted = (int*)malloc(sizeof(int) * data_drives);
    // global+1 is always recoverable
    for (int lost = global+2; lost <= groups*local+global; lost++)
    {
        // Exactly n rows are selected for inversion, so n*n elements are enough
        int *erased_matrix = (int*)malloc(sizeof(int) * n*n);
        int *inverted_matrix = (int*)malloc(sizeof(int) * n*n);
        // p holds the current combination of surviving row numbers in ascending order
        int *p = (int*)malloc(sizeof(int) * (total_rows-lost));
        for (int i = 0; i < n; i++)
            p[i] = i;
        if (total_rows-lost > n)
        {
            p[n-1] = n; // skip combinations with all N data disks (0..n-1)
            for (int i = n; i < total_rows-lost; i++)
                p[i] = i+1;
            p[total_rows-lost-1]--; // will be incremented on the first step
        }
        int inc = total_rows-lost-1;
        while (1)
        {
            p[inc]++;
            if (p[inc] >= n+groups*local+global)
            {
                if (inc == 0)
                    break;
                inc--;
            }
            else if (inc+1 < total_rows-lost)
            {
                p[inc+1] = p[inc];
                inc++;
            }
            else
            {
                // Check if it should be recoverable
                // Calculate count of data chunks lost in each group
                int nsel = 0;
                for (int gr = 0; gr < groups; gr++)
                {
                    lost_per_group[gr] = ((gr+1)*(n/groups) > n ? (n - gr*(n/groups)) : n/groups);
                    recovered_per_group[gr] = 0;
                }
                for (int j = 0; j < total_rows-lost; j++)
                {
                    if (p[j] < n)
                    {
                        lost_per_group[(p[j] / (n/groups))]--;
                        selected_inverted[nsel++] = j;
                    }
                }
                // Every local parity chunk is supposed to restore 1 missing chunk inside its group
                // So, subtract local parity chunk counts from each group lost chunk count
                for (int j = 0; j < total_rows-lost; j++)
                {
                    if (p[j] >= n && p[j] < n+groups*local)
                    {
                        int gr = (p[j]-n)/local;
                        if (lost_per_group[gr] > recovered_per_group[gr] && nsel < n)
                        {
                            selected_inverted[nsel++] = j;
                        }
                        recovered_per_group[gr]++;
                    }
                }
                // Every global parity chunk is supposed to restore 1 chunk of all that are still missing
                int still_missing = 0;
                for (int gr = 0; gr < groups; gr++)
                {
                    int non_fixed = lost_per_group[gr] - recovered_per_group[gr];
                    still_missing += (non_fixed > 0 ? non_fixed : 0);
                }
                for (int j = 0; j < total_rows-lost; j++)
                {
                    if (p[j] >= n+groups*local)
                    {
                        if (still_missing > 0 && nsel < n)
                        {
                            selected_inverted[nsel++] = j;
                        }
                        still_missing--;
                    }
                }
                if (still_missing <= 0)
                {
                    // We hope it can be recoverable. Try to invert it
                    assert(nsel == n);
                    for (int i = 0; i < n; i++)
                    {
                        for (int j = 0; j < n; j++)
                        {
                            erased_matrix[i*n+j] = lrc_matrix[p[selected_inverted[i]]*n+j];
                        }
                    }
                    int invert_ok = jerasure_invert_matrix(erased_matrix, inverted_matrix, n, w);
                    if (invert_ok < 0)
                    {
                        failures++;
                        if (log_level > 0)
                        {
                            printf("\nFAIL: ");
                            for (int i = 0; i < total_rows-lost; i++)
                            {
                                printf("%d ", p[i]);
                            }
                            printf("\nDIRECT:\n");
                            for (int i = 0; i < total_rows-lost; i++)
                            {
                                for (int j = 0; j < n; j++)
                                    printf("%d ", lrc_matrix[p[i]*n+j]);
                                printf("\n");
                            }
                            printf("INVERSE:\n");
                            // The inverse is n x n: printing more rows would read unwritten memory
                            for (int i = 0; i < n; i++)
                            {
                                for (int j = 0; j < n; j++)
                                    printf("%d ", inverted_matrix[i*n+j]);
                                printf("\n");
                            }
                        }
                    }
                    else
                    {
                        success++;
                        if (log_level > 2)
                        {
                            printf("OK: ");
                            for (int i = 0; i < total_rows-lost; i++)
                            {
                                printf("%d ", p[i]);
                            }
                            printf("\n");
                        }
                    }
                }
                else
                {
                    impossible++;
                    if (log_level > 1)
                    {
                        printf("IMPOSSIBLE: ");
                        for (int i = 0; i < total_rows-lost; i++)
                        {
                            printf("%d ", p[i]);
                        }
                        printf("\n");
                    }
                }
            }
        }
        free(p);
        free(inverted_matrix);
        free(erased_matrix);
    }
    free(selected_inverted);
    free(lost_per_group);
    free(recovered_per_group);
    return (struct lrc_test_result_t){
        .success = success,
        .impossible = impossible,
        .failures = failures,
    };
}
// Build the identity+LRC matrix for the hard-coded parameters below, print it,
// run the MR-LRC check on it and print the resulting counters.
// Returns 0 on success, 1 if the matrix can't be generated or allocated.
int main()
{
    int W = 8, MATRIX_W = 8;
    int n = 8, groups = 2, local = 1, global = 2;
    //n = 4, groups = 2, local = 1, global = 1;
    int total_rows = n+groups*local+global;
    int *matrix = reed_sol_vandermonde_lrc_matrix(n, groups, local, global, MATRIX_W);
    if (!matrix)
    {
        // The generator returns NULL on invalid parameters; memcpy() from NULL would crash
        fprintf(stderr, "Failed to generate LRC matrix\n");
        return 1;
    }
    int *lrc_matrix = (int*)malloc(sizeof(int) * total_rows*n);
    if (!lrc_matrix)
    {
        fprintf(stderr, "Failed to allocate LRC matrix\n");
        free(matrix);
        return 1;
    }
    // Fill identity+LRC matrix
    for (int i = 0; i < n; i++)
        for (int j = 0; j < n; j++)
            lrc_matrix[i*n + j] = j == i ? 1 : 0;
    memcpy(lrc_matrix + n*n, matrix, (total_rows-n)*n*sizeof(int));
    free(matrix);
    matrix = NULL;
    // Print LRC matrix
    for (int i = 0; i < total_rows; i++)
    {
        for (int j = 0; j < n; j++)
        {
            printf("%d ", lrc_matrix[i*n+j]);
        }
        printf("\n");
    }
    struct lrc_test_result_t t = check_mr_lrc(lrc_matrix, n, groups, local, global, W, 1);
    printf("\n%d recovered, %d impossible, %d failures\n", t.success, t.impossible, t.failures);
    free(lrc_matrix);
    return 0;
}
// 1 1 1 1 0 0 0 0
// 0 0 0 0 1 1 1 1
// 1 55 39 73 84 181 225 217
// 1 172 70 235 143 34 200 101
//
// Can't recover
// 1 2 4 5 8 9 10 11 -1
// 2 3 4 6 8 9 10 11 -1
// FULL:
// 1 0 0 0 0 0 0 0
// 0 1 0 0 0 0 0 0
// 0 0 1 0 0 0 0 0
// 0 0 0 1 0 0 0 0
// 0 0 0 0 1 0 0 0
// 0 0 0 0 0 1 0 0
// 0 0 0 0 0 0 1 0
// 0 0 0 0 0 0 0 1
// 1 1 1 1 0 0 0 0
// 0 0 0 0 1 1 1 1
// 1 55 39 73 84 181 225 217
// 1 172 70 235 143 34 200 101
// FIRST UNRECOVERABLE:
// 0 1 0 0 0 0 0 0
// 0 0 1 0 0 0 0 0
// 0 0 0 0 1 0 0 0
// 0 0 0 0 0 1 0 0
// 1 1 1 1 0 0 0 0
// 0 0 0 0 1 1 1 1
// 1 55 39 73 84 181 225 217
// 1 172 70 235 143 34 200 101
// SECOND UNRECOVERABLE:
// 0 0 1 0 0 0 0 0
// 0 0 0 1 0 0 0 0
// 0 0 0 0 1 0 0 0
// 0 0 0 0 0 0 1 0
// 1 1 1 1 0 0 0 0
// 0 0 0 0 1 1 1 1
// 1 55 39 73 84 181 225 217
// 1 172 70 235 143 34 200 101
// Ho ho ho
Loading…
Cancel
Save