parent
d289753df4
commit
65d08e067e
@ -0,0 +1,414 @@ |
||||
#!/usr/bin/nodejs
|
||||
// systemd unit generator for hybrid (HDD+SSD) vitastor OSDs
|
||||
// Copyright (c) Vitaliy Filippov, 2019+
|
||||
// License: VNPL-1.1
|
||||
|
||||
// USAGE: nodejs make-osd-hybrid.js [--disable_ssd_cache 0] [--disable_hdd_cache 0] /dev/sda /dev/sdb /dev/sdc /dev/sdd ...
|
||||
// I.e. - just pass all HDDs and SSDs mixed, the script will decide where
|
||||
// to put journals on its own
|
||||
|
||||
const fs = require('fs'); |
||||
const fsp = fs.promises; |
||||
const child_process = require('child_process'); |
||||
|
||||
// Default settings. parse_options() copies every `--key value` pair from the
// command line over these values and prints them as defaults in the help text.
const options = {
    // When truthy, system() echoes each shell command to stderr before running it
    debug: 1,
    // Journal partition size in bytes (rounded up to whole MiB when the partition is created)
    journal_size: 1024*1024*1024,
    // Lower bound for the metadata partition size, in bytes
    min_meta_size: 1024*1024*1024,
    // Object size in bytes; passed to the OSD as --block_size and used to size the metadata area
    object_size: 1024*1024,
    // Used together with object_size to compute the size of one metadata entry
    bitmap_granularity: 4096,
    // Passed to the OSD as --disk_alignment, --journal_block_size and --meta_block_size
    device_block_size: 4096,
    // When truthy, the script checks whether the SSD can work in write-through mode
    // and, if so, disables journal/metadata fsyncs for the OSD
    disable_ssd_cache: 1,
    // When truthy, the generated unit switches the HDD to write-through mode before the OSD starts
    disable_hdd_cache: 1,
};

// console.fatal() does not exist, so passing it to catch() left rejections unhandled.
// Report the error explicitly and signal failure through the exit code.
run().catch(e =>
{
    console.error(e);
    process.exit(1);
});
||||
|
||||
// Entry point of the script.
// Parses the command line, makes sure the log directory exists, inspects the
// given block devices and the already installed OSD units, and then creates
// OSDs for the HDDs that do not have one yet. Always ends the process with code 0.
async function run()
{
    const requested_devices = parse_options();
    await system_or_die("mkdir -p /var/log/vitastor; chown vitastor /var/log/vitastor");
    // Inspect the devices and split them into SSDs and HDDs
    const ssds = [];
    const hdds = [];
    for (const device of await collect_devices(requested_devices))
    {
        (device.ssd ? ssds : hdds).push(device);
    }
    // Read the OSD units that are already installed
    const existing_units = await collect_osd_units();
    // Fill in journal counts and free space of every SSD
    await check_journal_count(ssds, existing_units);
    // Set up OSDs on the HDDs which are not used yet
    await create_new_hybrid_osds(hdds, ssds, existing_units);
    process.exit(0);
}
||||
|
||||
// Parse process.argv.
//
// Arguments of the form `--key value` override entries of the global `options`
// object, everything else is treated as a device path. `--help`, `-h` or an
// empty device list print the usage text and exit with code 0.
//
// Values of options whose default is a number are converted to numbers.
// Otherwise they would stay strings: "0" would be truthy for the
// disable_*_cache switches and sizes would be concatenated instead of added.
// A missing or non-numeric value for such an option is reported and the
// script exits with code 1.
//
// Returns the array of device paths in the order they were given.
function parse_options()
{
    const devices = [];
    const opt = {};
    let help = false;
    for (let i = 2; i < process.argv.length; i++)
    {
        const arg = process.argv[i];
        if (arg == '--help' || arg == '-h')
        {
            help = true;
            break;
        }
        else if (arg.substr(0, 2) == '--')
        {
            const key = arg.substr(2);
            const value = process.argv[++i];
            if (value === undefined)
            {
                console.error('Option --'+key+' requires a value');
                process.exit(1);
            }
            if (typeof options[key] == 'number')
            {
                const num = Number(value);
                if (String(value).trim() === '' || Number.isNaN(num))
                {
                    console.error('Option --'+key+' requires a numeric value, got "'+value+'"');
                    process.exit(1);
                }
                opt[key] = num;
            }
            else
            {
                // Options without a numeric default are kept as given
                opt[key] = value;
            }
        }
        else
        {
            devices.push(arg);
        }
    }
    if (help || !devices.length)
    {
        console.log(
            'Prepare hybrid (HDD+SSD) Vitastor OSDs\n'+
            '(c) Vitaliy Filippov, 2019+, license: VNPL-1.1\n\n'+
            'USAGE: nodejs make-osd-hybrid.js [OPTIONS] /dev/sda /dev/sdb /dev/sdc ...\n'+
            'Just pass all your SSDs and HDDs in any order, the script will distribute OSDs for you.\n\n'+
            'OPTIONS (with defaults):\n'+
            Object.keys(options).map(k => ` --${k} ${options[k]}`).join('\n')
        );
        process.exit(0);
    }
    // Apply overrides only after the help text was printed with the real defaults
    for (const k in opt)
    {
        options[k] = opt[k];
    }
    return devices;
}
||||
|
||||
// Collect devices
|
||||
// Inspect the given device paths and return the usable ones.
//
// A path is skipped (with a message) when it does not start with /dev/, has no
// /sys/block/<name> entry, has no queue/rotational attribute, has no readable
// partition table but is recognised by blkid, or carries a partition table
// that is not GPT.
//
// Returns an array of { path, ssd, parts }: `ssd` is true when the rotational
// attribute parses to 0, `parts` is the "partitiontable" object printed by
// sfdisk, or null when sfdisk printed nothing.
async function collect_devices(devices_to_check)
{
    const usable = [];
    for (const path of devices_to_check)
    {
        if (!path.startsWith('/dev/'))
        {
            console.log(`${path} does not start with /dev/, skipping`);
            continue;
        }
        const sysfs_dir = '/sys/block/'+path.slice(5);
        if (!await file_exists(sysfs_dir))
        {
            console.log(`${path} is a partition, skipping`);
            continue;
        }
        // The rotational attribute tells SSDs from HDDs
        const rotational_attr = sysfs_dir+'/queue/rotational';
        if (!await file_exists(rotational_attr))
        {
            console.log(`${path} does not have ${rotational_attr} to check whether it's an SSD, skipping`);
            continue;
        }
        const rotational = parseInt(await fsp.readFile(rotational_attr, { encoding: 'utf-8' }));
        // Try to read the partition table
        const [ dump_status, dump_json ] = await system(`sfdisk --dump ${path} --json`);
        if (dump_status != 0)
        {
            // No partition table: make sure the bare device does not hold anything blkid recognises
            const [ blkid_status, blkid_out ] = await system(`blkid ${path}`);
            if (blkid_status == 0)
            {
                console.log(`${path} contains data, skipping:\n ${blkid_out.trim().replace(/\n/g, '\n ')}`);
                continue;
            }
        }
        let table = null;
        if (dump_json)
        {
            table = JSON.parse(dump_json).partitiontable;
        }
        if (table && table.label != 'gpt')
        {
            console.log(`${path} contains "${table.label}" partition table, only GPT is supported, skipping`);
            continue;
        }
        usable.push({ path, ssd: !rotational, parts: table });
    }
    return usable;
}
||||
|
||||
// Collect existing OSD units
|
||||
// Read all /etc/systemd/system/vitastor-osd*.service files and extract the
// OSD command line options from their ExecStart= line.
//
// Returns an array with one object per unit, mapping option names (without
// the leading "--") to their string values, e.g.
// { osd_num: '1', data_device: '/dev/disk/by-partuuid/...', ... }.
// Units without ExecStart= are skipped with a message. A command line that
// has a non-option word where an option name is expected stops the script
// with exit code 1.
async function collect_osd_units()
{
    const units = [];
    const listing = (await system("ls /etc/systemd/system/vitastor-osd*.service"))[1];
    for (const unit of listing.trim().split('\n'))
    {
        if (!unit)
        {
            continue;
        }
        const text = await fsp.readFile(unit, { encoding: 'utf-8' });
        // The "m" flag is required: ExecStart= is never the first line of a
        // unit file, so "^" has to match at the beginning of any line.
        // The capture includes all backslash-continued lines.
        const found = /^ExecStart\s*=\s*(([^\n]*\\\n)*[^\n]*)/m.exec(text);
        if (!found)
        {
            console.log('ExecStart= not found in '+unit+', skipping');
            continue;
        }
        const cmdline = found[1]
            // Strip the `bash -c '` wrapper...
            .replace(/^bash\s+-c\s+'/, '')
            // ...and the trailing `>>logfile 2>&1'` (there is whitespace between the file and 2>&1)
            .replace(/\s*>>\s*\S+\s*2>\s*&1\s*'\s*$/, '')
            // Join continuation lines
            .replace(/\s*\\\n\s*/g, ' ');
        const kv = {};
        const word_re = /([^\s']+)|'([^']+)'/g;
        let key = null;
        let is_first_word = true;
        let match;
        while ((match = word_re.exec(cmdline)) != null)
        {
            const word = match[1] || match[2];
            const first = is_first_word;
            is_first_word = false;
            if (key != null)
            {
                kv[key] = word;
                key = null;
            }
            else if (word.substr(0, 2) == '--')
            {
                key = word.substr(2);
            }
            else if (first)
            {
                // The command begins with the path of the OSD executable, not with an option
                continue;
            }
            else
            {
                console.log('Strange command line in '+unit+', stopping');
                process.exit(1);
            }
        }
        units.push(kv);
    }
    return units;
}
||||
|
||||
// Count assigned HDD journals and unallocated space for each SSD
|
||||
// For every SSD, set `journals` to the number of its partitions that are used
// as --journal_device by an existing OSD unit, and `free` to the number of
// unallocated bytes. For an SSD without a partition table `free` is the whole
// device size as reported by `blockdev --getsize64`.
async function check_journal_count(ssds, osd_units)
{
    // Journal device paths referenced by the installed units
    const journal_paths = new Set();
    for (const unit of osd_units)
    {
        if (unit.journal_device)
        {
            journal_paths.add(unit.journal_device);
        }
    }
    for (const ssd of ssds)
    {
        if (!ssd.parts)
        {
            ssd.journals = 0;
            ssd.free = parseInt(await system_or_die("blockdev --getsize64 "+ssd.path));
            continue;
        }
        const journal_parts = ssd.parts.partitions.filter(
            p => p.uuid && journal_paths.has('/dev/disk/by-partuuid/'+p.uuid.toLowerCase())
        );
        ssd.journals = journal_parts.length;
        ssd.free = free_from_parttable(ssd.parts);
    }
}
||||
|
||||
// Create an OSD on every HDD that is suitable and not yet used by an existing unit.
//
// An HDD without a partition table first gets a GPT label with one partition
// created with sfdisk defaults. An HDD is then skipped (with a message) unless
// it has exactly one partition that ends on the last usable LBA and has a
// partition UUID. It is skipped silently when a unit already uses that
// partition as its --data_device.
async function create_new_hybrid_osds(hdds, ssds, osd_units)
{
    const used_data_devices = new Set(osd_units.map(unit => unit.data_device));
    for (const hdd of hdds)
    {
        if (!hdd.parts)
        {
            // Not partitioned yet: "+ +" asks sfdisk for the default start and the default size
            await system_or_die('sfdisk '+hdd.path, 'label: gpt\n\n+ +\n');
            hdd.parts = JSON.parse(await system_or_die('sfdisk --dump '+hdd.path+' --json')).partitiontable;
        }
        const table = hdd.parts;
        if (table.partitions.length != 1)
        {
            console.log(hdd.path+' has more than 1 partition, skipping');
            continue;
        }
        const data_part = table.partitions[0];
        if ((data_part.start + data_part.size) != (1 + table.lastlba))
        {
            console.log(hdd.path+'1 is not a whole-disk partition, skipping');
            continue;
        }
        if (!data_part.uuid)
        {
            console.log(data_part.node+' does not have UUID. Please repartition '+hdd.path+' with GPT');
            continue;
        }
        if (used_data_devices.has('/dev/disk/by-partuuid/'+data_part.uuid.toLowerCase()))
        {
            continue;
        }
        await create_hybrid_osd(hdd, ssds);
    }
}
||||
|
||||
// Create one hybrid OSD: data on the single partition of the HDD `dev`,
// journal and metadata on two new partitions of one of `ssds`.
//
// Steps:
// 1. compute the metadata partition size from the data partition size,
// 2. pick the SSD with enough free space that carries the fewest journals,
// 3. allocate an OSD number with `vitastor-cli alloc-osd`,
// 4. add the two partitions to the SSD, wait for their by-partuuid symlinks
//    and fill them with zeroes,
// 5. write /etc/systemd/system/vitastor-osd<N>.service and enable it.
//
// Exits the process with code 1 when no SSD has enough free space or when no
// OSD number was returned.
async function create_hybrid_osd(dev, ssds)
{
    // Calculate metadata size
    const data_part = dev.parts.partitions[0];
    const data_device = '/dev/disk/by-partuuid/'+data_part.uuid.toLowerCase();
    const data_size = data_part.size * dev.parts.sectorsize;
    const meta_entry_size = 24 + 2*options.object_size/options.bitmap_granularity/8;
    const entries_per_block = Math.floor(options.device_block_size / meta_entry_size);
    const object_count = Math.floor(data_size / options.object_size);
    let meta_size = Math.ceil(1 + object_count / entries_per_block) * options.device_block_size;
    // Leave some extra space for future metadata formats and round metadata area size to multiples of 1 MB
    meta_size = 2*meta_size;
    meta_size = Math.ceil(meta_size/1024/1024) * 1024*1024;
    if (meta_size < options.min_meta_size)
        meta_size = options.min_meta_size;
    const journal_size = Math.ceil(options.journal_size/1024/1024) * 1024*1024;
    // Pick an SSD for journal, balancing the number of journals across SSDs
    let selected_ssd;
    for (const ssd of ssds)
        if (ssd.free >= (meta_size+journal_size) && (!selected_ssd || selected_ssd.journals > ssd.journals))
            selected_ssd = ssd;
    if (!selected_ssd)
    {
        console.error('Could not find free space for SSD journal and metadata for '+dev.path);
        process.exit(1);
    }
    // Allocate an OSD number
    const osd_num = (await system_or_die("vitastor-cli alloc-osd")).trim();
    if (!osd_num)
    {
        console.error('Failed to run vitastor-cli alloc-osd');
        process.exit(1);
    }
    console.log('Creating OSD '+osd_num+' on '+dev.path+' (HDD) with journal and metadata on '+selected_ssd.path+' (SSD)');
    // Add two partitions: journal and metadata
    const new_parts = await add_partitions(selected_ssd, [ journal_size, meta_size ]);
    selected_ssd.journals++;
    const journal_device = '/dev/disk/by-partuuid/'+new_parts[0].uuid.toLowerCase();
    const meta_device = '/dev/disk/by-partuuid/'+new_parts[1].uuid.toLowerCase();
    // Wait until the device symlinks appear
    for (const link of [ journal_device, meta_device ])
    {
        while (!await file_exists(link))
        {
            await new Promise(ok => setTimeout(ok, 100));
        }
    }
    // Zero out metadata and journal
    await system_or_die("dd if=/dev/zero of="+journal_device+" bs=1M count="+(journal_size/1024/1024)+" oflag=direct");
    await system_or_die("dd if=/dev/zero of="+meta_device+" bs=1M count="+(meta_size/1024/1024)+" oflag=direct");
    // Decide whether the SSD works (or can be switched to work) in write-through mode
    const ssd_name = selected_ssd.path.substr(5);
    const has_scsi_cache_type = options.disable_ssd_cache &&
        (await system("ls /sys/block/"+ssd_name+"/device/scsi_disk/*/cache_type"))[0] == 0;
    let nvme_write_through = false;
    if (options.disable_ssd_cache && !has_scsi_cache_type && ssd_name.substr(0, 4) == 'nvme')
    {
        // queue/write_cache is a sysfs attribute: it has to be read, it cannot be
        // executed as a command. When it is unreadable, fsyncs stay enabled.
        try
        {
            const mode = await fsp.readFile("/sys/block/"+ssd_name+"/queue/write_cache", { encoding: 'utf-8' });
            nvme_write_through = mode.trim() == "write through";
        }
        catch (e)
        {
            console.error('Failed to read write cache mode of '+selected_ssd.path+': '+e.message);
        }
    }
    const write_through = options.disable_ssd_cache && (has_scsi_cache_type || nvme_write_through);
    // Create unit file for the OSD
    await fsp.writeFile('/etc/systemd/system/vitastor-osd'+osd_num+'.service',
`[Unit]
Description=Vitastor object storage daemon osd.${osd_num}
After=network-online.target local-fs.target time-sync.target
Wants=network-online.target local-fs.target time-sync.target
PartOf=vitastor.target

[Service]
LimitNOFILE=1048576
LimitNPROC=1048576
LimitMEMLOCK=infinity
ExecStart=bash -c '/usr/bin/vitastor-osd \\
    --osd_num ${osd_num} ${write_through
        ? "--disable_meta_fsync 1 --disable_journal_fsync 1 --immediate_commit "+(options.disable_hdd_cache ? "all" : "small")
        : ""} \\
    --throttle_small_writes 1 \\
    --disk_alignment ${options.device_block_size} \\
    --journal_block_size ${options.device_block_size} \\
    --meta_block_size ${options.device_block_size} \\
    --journal_no_same_sector_overwrites true \\
    --journal_sector_buffer_count 1024 \\
    --block_size ${options.object_size} \\
    --data_device ${data_device} \\
    --journal_device ${journal_device} \\
    --meta_device ${meta_device} >>/var/log/vitastor/osd${osd_num}.log 2>&1'
WorkingDirectory=/
ExecStartPre=+chown vitastor:vitastor ${data_device}
ExecStartPre=+chown vitastor:vitastor ${journal_device}
ExecStartPre=+chown vitastor:vitastor ${meta_device}${
    has_scsi_cache_type
    ? "\nExecStartPre=+bash -c 'D=$$$(readlink "+journal_device+"); echo write through > $$$(dirname /sys/block/*/$$\${D##*/})/device/scsi_disk/*/cache_type'"
    : ""}${
    options.disable_hdd_cache
    ? "\nExecStartPre=+bash -c 'D=$$$(readlink "+data_device+"); echo write through > $$$(dirname /sys/block/*/$$\${D##*/})/device/scsi_disk/*/cache_type'"
    : ""}
User=vitastor
PrivateTmp=false
TasksMax=infinity
Restart=always
StartLimitInterval=0
RestartSec=10

[Install]
WantedBy=vitastor.target
`);
    await system_or_die("systemctl enable vitastor-osd"+osd_num);
}
||||
|
||||
// Append one partition per entry of `sizes` (in bytes) to the device `dev`.
//
// The whole partition table is rewritten with an sfdisk script: a GPT label,
// the existing partitions copied from dev.parts, then one line per new
// partition with the default start and the size rounded up to whole KiB.
// Afterwards the table is read back, and dev.parts and dev.free are updated.
//
// Returns the partition entries whose UUIDs were not in the old table. Exits
// the process with code 1 when their number differs from sizes.length.
async function add_partitions(dev, sizes)
{
    const existing = dev.parts ? dev.parts.partitions : [];
    const lines = [ 'label: gpt', '' ];
    // Existing partitions are re-declared with all their fields except the device node
    for (const part of existing)
    {
        const fields = Object.entries(part)
            .filter(([ name ]) => name != 'node')
            .map(([ name, value ]) => name+'='+value);
        lines.push(part.node+': '+fields.join(', '));
    }
    // Partitions to be added
    for (const bytes of sizes)
    {
        lines.push('+ '+Math.ceil(bytes/1024)+'KiB');
    }
    await system_or_die('sfdisk '+dev.path, lines.join('\n')+'\n');
    // Re-read the table and pick the entries that were not there before
    const newpt = JSON.parse(await system_or_die('sfdisk --dump '+dev.path+' --json')).partitiontable;
    const known_uuids = new Set(existing.map(part => part.uuid));
    const added = newpt.partitions.filter(part => !known_uuids.has(part.uuid));
    if (added.length != sizes.length)
    {
        console.error('Failed to partition '+dev.path+': new partitions not found in table');
        process.exit(1);
    }
    dev.parts = newpt;
    dev.free = free_from_parttable(newpt);
    return added;
}
||||
|
||||
// Return the number of bytes of the partition table `pt` (as printed by
// `sfdisk --json`) that are not covered by any partition: the sectors between
// firstlba and lastlba inclusive, minus the sizes of all partitions,
// multiplied by the sector size.
function free_from_parttable(pt)
{
    const usable_sectors = pt.lastlba + 1 - pt.firstlba;
    const free_sectors = pt.partitions.reduce((left, part) => left - part.size, usable_sectors);
    return free_sectors * pt.sectorsize;
}
||||
|
||||
// Run a shell command through system() and return what it wrote to stdout.
// `input` is passed on to system() as the second argument. When the exit code
// is not 0, the command and its stderr output are printed and the process
// exits with code 1.
async function system_or_die(cmd, input = '')
{
    const result = await system(cmd, input);
    if (result[0] == 0)
    {
        return result[1];
    }
    console.error(cmd+' failed: '+result[2]);
    process.exit(1);
    return result[1];
}
||||
|
||||
// Run `cmd` through the shell, feed `input` to its stdin and wait for it to finish.
//
// Returns [ exit code, stdout text, stderr text ]. The exit code is null when
// the command was terminated by a signal. When options.debug is truthy, the
// command (and its input, as a here-document) is echoed to stderr first.
// The returned promise is rejected when the shell itself cannot be started.
async function system(cmd, input = '')
{
    if (options.debug)
    {
        process.stderr.write('+ '+cmd+(input ? " <<EOF\n"+input.replace(/\s*$/, '\n')+"EOF" : '')+'\n');
    }
    const cp = child_process.spawn(cmd, { shell: true });
    let stdout = '', stderr = '';
    // Decode inside the streams so that a multi-byte character split between
    // two chunks is not corrupted
    cp.stdout.setEncoding('utf-8');
    cp.stderr.setEncoding('utf-8');
    cp.stdout.on('data', text => stdout += text);
    cp.stderr.on('data', text => stderr += text);
    const finished = new Promise((ok, no) =>
    {
        cp.on('error', no);
        // 'exit' may be emitted while output is still buffered in the pipes;
        // 'close' is emitted only after stdout and stderr have ended
        cp.on('close', () => ok());
    });
    // The command may finish without reading its stdin; the resulting EPIPE is
    // not a failure by itself, the exit code tells whether the command succeeded
    cp.stdin.on('error', () => {});
    cp.stdin.end(input);
    await finished;
    return [ cp.exitCode, stdout, stderr ];
}
||||
|
||||
// Resolve to true when `filename` can be accessed for reading by the current
// process (fs.access with R_OK reports no error), and to false otherwise.
async function file_exists(filename)
{
    const access_error = await new Promise(done => fs.access(filename, fs.constants.R_OK, done));
    return !access_error;
}
Loading…
Reference in new issue