41 changed files with 1887 additions and 0 deletions
@ -0,0 +1,55 @@ |
|||
#!/usr/bin/nodejs

// Generate Hugo parameter reference pages (hugo/content/config/*.md) from the
// YAML parameter definitions in params/*.yml, one page per file per language.

const fs = require('fs');
const yaml = require('yaml');

// Localized labels for the per-parameter header lines ('en' falls back to the keys themselves)
const L = {
    en: {},
    ru: {
        Type: 'Тип',
        Default: 'Значение по умолчанию',
        Minimum: 'Минимальное значение',
    },
};

// Human-readable names for parameter types, per language
const types = {
    en: {
        string: 'string',
        bool: 'boolean',
        int: 'integer',
        sec: 'seconds',
        ms: 'milliseconds',
        us: 'microseconds',
    },
    ru: {
        string: 'строка',
        bool: 'булево (да/нет)',
        int: 'целое число',
        sec: 'секунды',
        ms: 'миллисекунды',
        us: 'микросекунды',
    },
};

// Collect the base names of all params/*.yml files
const params_files = fs.readdirSync(__dirname+'/params')
    .filter(f => f.substr(-4) == '.yml')
    .map(f => f.substr(0, f.length-4));

for (const file of params_files)
{
    const cfg = yaml.parse(fs.readFileSync(__dirname+'/params/'+file+'.yml', { encoding: 'utf-8' }));
    for (const lang in types)
    {
        // Build one "## <name>" section per parameter with localized type/default/minimum lines
        let out = '\n\n{{< toc >}}';
        for (const c of cfg)
        {
            out += `\n\n## ${c.name}\n\n`;
            out += `- ${L[lang]['Type'] || 'Type'}: ${c["type_"+lang] || types[lang][c.type] || c.type}\n`;
            if (c.default !== undefined)
                out += `- ${L[lang]['Default'] || 'Default'}: ${c.default}\n`;
            if (c.min !== undefined)
                out += `- ${L[lang]['Minimum'] || 'Minimum'}: ${c.min}\n`;
            out += `\n`+(c["info_"+lang] || c["info"]).replace(/\s+$/, '');
        }
        // Prepend the static page header and write the resulting Hugo page
        const head = fs.readFileSync(__dirname+'/params/head/'+file+'.'+lang+'.md', { encoding: 'utf-8' });
        fs.writeFileSync(__dirname+'/hugo/content/config/'+file+'.'+lang+'.md', head.replace(/\s+$/, '')+out+"\n");
    }
}
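For reference, a minimal sketch of the entry format this script expects in `params/*.yml`
(field names are taken from the code above; the parameter name and values are hypothetical,
and a static header file `params/head/<file>.<lang>.md` must also exist for each page):

```
- name: example_parameter
  type: sec
  default: 5
  min: 1
  info: |
    English description of the parameter.
  info_ru: |
    Русское описание параметра.
```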
@ -0,0 +1,6 @@ |
|||
--- |
|||
title: "{{ replace .Name "-" " " | title }}" |
|||
date: {{ .Date }} |
|||
draft: true |
|||
--- |
|||
|
@ -0,0 +1,35 @@ |
|||
baseURL: http://localhost |
|||
title: Vitastor |
|||
theme: hugo-geekdoc |
|||
#languageCode: en-us |
|||
|
|||
pluralizeListTitles: false |
|||
|
|||
# Geekdoc required configuration |
|||
pygmentsUseClasses: true |
|||
pygmentsCodeFences: true |
|||
disablePathToLower: true |
|||
|
|||
# Required if you want to render robots.txt template |
|||
enableRobotsTXT: true |
|||
|
|||
defaultContentLanguage: en |
|||
languages: |
|||
en: |
|||
weight: 1 |
|||
languageName: English |
|||
ru: |
|||
weight: 1 |
|||
languageName: Русский |
|||
|
|||
markup: |
|||
goldmark: |
|||
renderer: |
|||
# Needed for mermaid shortcode |
|||
unsafe: true |
|||
tableOfContents: |
|||
startLevel: 1 |
|||
endLevel: 9 |
|||
|
|||
taxonomies: |
|||
tag: tags |
@ -0,0 +1,6 @@ |
|||
## The Idea |
|||
|
|||
Vitastor is a small, simple and fast clustered block storage (storage for VM drives), |
|||
architecturally similar to Ceph which means strong consistency, primary-replication, |
|||
symmetric clustering and automatic data distribution over any number of drives |
|||
of any size with configurable redundancy (replication or erasure codes/XOR). |
@ -0,0 +1,61 @@ |
|||
--- |
|||
title: Parameter Reference |
|||
weight: 1 |
|||
--- |
|||
|
|||
Vitastor configuration consists of: |
|||
- Configuration parameters (key-value), described here |
|||
- [Pool configuration]({{< ref "config/pool" >}}) |
|||
- OSD placement tree configuration |
|||
- Inode configuration i.e. image metadata like name, size and parent reference |
|||
|
|||
Configuration parameters can be set in 3 places: |
|||
- Configuration file (`/etc/vitastor/vitastor.conf` or other path) |
|||
- etcd key `/vitastor/config/global`. Most variables can be set there, but etcd |
|||
connection parameters should obviously be set in the configuration file. |
|||
- Command line of Vitastor components: OSD, mon, fio and QEMU options,
OpenStack/Proxmox/etc configuration. The latter doesn't let you set all
variables directly, but it lets you override the path to the configuration
file and set everything you need inside that file.
|||
|
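For example, the same cluster could combine the three methods above like this
(all addresses and values below are hypothetical):

```
# 1) /etc/vitastor/vitastor.conf - etcd connection parameters
{"etcd_address":["10.0.0.10:2379","10.0.0.11:2379","10.0.0.12:2379"]}

# 2) cluster-wide defaults stored in etcd
etcdctl --endpoints=http://10.0.0.10:2379 put /vitastor/config/global '{"immediate_commit":"all"}'

# 3) component command line overriding the etcd address, e.g. for vitastor-nbd
vitastor-nbd map --etcd_address 10.0.0.10:2379/v3 --image testimg
```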
|||
In the future, additional configuration methods may be added: |
|||
- OSD superblock which will, by design, contain parameters related to the disk |
|||
layout and to one specific OSD. |
|||
- OSD-specific keys in etcd like `/vitastor/config/osd/<number>`. |
|||
|
|||
## Common Parameters |
|||
|
|||
These are the most common parameters which apply to all components of Vitastor. |
|||
|
|||
[See the list]({{< ref "common" >}}) |
|||
|
|||
## Cluster-Wide Disk Layout Parameters |
|||
|
|||
These parameters apply to clients and OSDs and can't be changed after OSD |
|||
initialization. |
|||
|
|||
[See the list]({{< ref "layout-cluster" >}}) |
|||
|
|||
## OSD Disk Layout Parameters |
|||
|
|||
These parameters apply to OSDs and can't be changed after OSD initialization. |
|||
|
|||
[See the list]({{< ref "layout-osd" >}}) |
|||
|
|||
## Network Protocol Parameters |
|||
|
|||
These parameters apply to clients and OSDs and can be changed with a restart. |
|||
|
|||
[See the list]({{< ref "network" >}}) |
|||
|
|||
## Runtime OSD Parameters |
|||
|
|||
These parameters apply to OSDs and can be changed with an OSD restart. |
|||
|
|||
[See the list]({{< ref "osd" >}}) |
|||
|
|||
## Monitor Parameters |
|||
|
|||
These parameters only apply to Monitors. |
|||
|
|||
[See the list]({{< ref "monitor" >}}) |
@ -0,0 +1,63 @@ |
|||
--- |
|||
title: Перечень настроек |
|||
weight: 1 |
|||
--- |
|||
|
|||
Конфигурация Vitastor состоит из: |
|||
- Параметров (ключ-значение), описанных на данной странице |
|||
- Настроек пулов |
|||
- Настроек дерева OSD |
|||
- Настроек инодов, т.е. метаданных образов, таких, как имя, размер и ссылки на |
|||
родительский образ |
|||
|
|||
Параметры конфигурации могут задаваться в 3 местах: |
|||
- Файле конфигурации (`/etc/vitastor/vitastor.conf` или по другому пути) |
|||
- Ключе в etcd `/vitastor/config/global`. Большая часть параметров может |
|||
задаваться там, кроме, естественно, самих параметров соединения с etcd, |
|||
которые должны задаваться в файле конфигурации |
|||
- В командной строке компонентов Vitastor: OSD, монитора, опциях fio и QEMU, |
|||
настроек OpenStack, Proxmox и т.п. Последние, как правило, не включают полный |
|||
набор параметров напрямую, но разрешают определить путь к файлу конфигурации |
|||
и задать любые параметры в нём. |
|||
|
|||
В будущем также могут быть добавлены другие способы конфигурации: |
|||
- Суперблок OSD, в котором будут храниться параметры OSD, связанные с дисковым |
|||
форматом и с этим конкретным OSD. |
|||
- OSD-специфичные ключи в etcd типа `/vitastor/config/osd/<номер>`. |
|||
|
|||
## Общие параметры |
|||
|
|||
Это наиболее общие параметры, используемые всеми компонентами Vitastor. |
|||
|
|||
[Посмотреть список]({{< ref "common" >}}) |
|||
|
|||
## Дисковые параметры уровня кластера |
|||
|
|||
Эти параметры используются клиентами и OSD и не могут быть изменены после |
|||
инициализации OSD. |
|||
|
|||
[Посмотреть список]({{< ref "layout-cluster" >}}) |
|||
|
|||
## Дисковые параметры OSD |
|||
|
|||
Эти параметры используются OSD и не могут быть изменены после инициализации OSD. |
|||
|
|||
[Посмотреть список]({{< ref "layout-osd" >}}) |
|||
|
|||
## Параметры сетевого протокола |
|||
|
|||
Эти параметры используются клиентами и OSD и могут быть изменены с перезапуском. |
|||
|
|||
[Посмотреть список]({{< ref "network" >}}) |
|||
|
|||
## Изменяемые параметры OSD |
|||
|
|||
Эти параметры используются OSD и могут быть изменены с перезапуском. |
|||
|
|||
[Посмотреть список]({{< ref "osd" >}}) |
|||
|
|||
## Параметры мониторов |
|||
|
|||
Данные параметры используются только мониторами Vitastor. |
|||
|
|||
[Посмотреть список]({{< ref "monitor" >}}) |
@ -0,0 +1,178 @@ |
|||
--- |
|||
title: Pool configuration |
|||
weight: 100 |
|||
--- |
|||
|
|||
Pool configuration is set in etcd key `/vitastor/config/pools` in the following |
|||
JSON format: |
|||
|
|||
``` |
|||
{ |
|||
"<Numeric ID>": { |
|||
"name": "<name>", |
|||
...other parameters... |
|||
} |
|||
} |
|||
``` |
|||
|
|||
{{< toc >}} |
|||
|
|||
# Parameters |
|||
|
|||
## name |
|||
|
|||
- Type: string |
|||
- Required |
|||
|
|||
Pool name. |
|||
|
|||
## scheme |
|||
|
|||
- Type: string |
|||
- Required |
|||
- One of: "replicated", "xor" or "jerasure" |
|||
|
|||
Redundancy scheme used for data in this pool. |
|||
|
|||
## pg_size |
|||
|
|||
- Type: integer |
|||
- Required |
|||
|
|||
Total number of disks for PGs of this pool - i.e., number of replicas for |
|||
replicated pools and number of data plus parity disks for EC/XOR pools. |
|||
|
|||
## parity_chunks |
|||
|
|||
- Type: integer |
|||
|
|||
Number of parity chunks for EC/XOR pools. For such pools, data will be lost |
|||
if you lose more than parity_chunks disks at once, so this parameter can be |
|||
equally described as FTT (number of failures to tolerate). |
|||
|
|||
Required for EC/XOR pools, ignored for replicated pools. |
|||
|
|||
## pg_minsize |
|||
|
|||
- Type: integer |
|||
- Required |
|||
|
|||
Number of available live disks for PGs of this pool to remain active. |
|||
That is, if it becomes impossible to place PG data on at least (pg_minsize) |
|||
OSDs, PG is deactivated for both read and write. So you know that a fresh |
|||
write always goes to at least (pg_minsize) OSDs (disks). |
|||
|
|||
FIXME: pg_minsize behaviour may be changed in the future to only make PGs |
|||
read-only instead of deactivating them. |
|||
|
|||
## pg_count |
|||
|
|||
- Type: integer |
|||
- Required |
|||
|
|||
Number of PGs for this pool. The value should be big enough for the monitor / |
|||
LP solver to be able to optimize data placement. |
|||
|
|||
"Enough" is usually around 64-128 PGs per OSD, i.e. you set pg_count for pool |
|||
to (total OSD count * 100 / pg_size). You can round it to the closest power of 2, |
|||
because it makes it easier to reduce or increase PG count later by dividing or |
|||
multiplying it by 2. |
|||
|
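A hypothetical example: with 32 OSDs and pg_size = 2, a reasonable starting point would be

```
pg_count = 32 * 100 / 2 = 1600, rounded to the closest power of 2: 1024 or 2048
```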
|||
In Vitastor, PGs are ephemeral, so you can change pool PG count anytime just |
|||
by overwriting pool configuration in etcd. Amount of the data affected by |
|||
rebalance will be smaller if the new PG count is a multiple of the old PG count |
|||
or vice versa. |
|||
|
|||
## failure_domain |
|||
|
|||
- Type: string |
|||
- Default: host |
|||
|
|||
Failure domain specification. Must be "host" or "osd" or refer to one of the |
|||
placement tree levels, defined in [placement_levels]({{< ref "config/monitor#placement_levels" >}}). |
|||
|
|||
Two replicas, or two parts in case of EC/XOR, of the same block of data are |
|||
never put on OSDs in the same failure domain (for example, on the same host). |
|||
So the failure domain specifies the unit whose failure you are protecting yourself
from.
|||
|
|||
## max_osd_combinations |
|||
|
|||
- Type: integer |
|||
- Default: 10000 |
|||
|
|||
The Vitastor data placement algorithm is based on an LP solver, and the OSD
combinations fed to it are generated randomly. This parameter specifies the maximum
number of combinations to generate when optimising PG placement.

This parameter usually doesn't need to be changed.
|||
|
|||
## pg_stripe_size |
|||
|
|||
- Type: integer |
|||
- Default: 0 |
|||
|
|||
Specifies the stripe size for this pool according to which images are split into |
|||
different PGs. Stripe size can't be smaller than [block_size]({{< ref "config/layout-cluster#block_size" >}}) |
|||
multiplied by (pg_size - parity_chunks) for EC/XOR pools, or 1 for replicated pools, |
|||
and the same value is used by default. |
|||
|
|||
This means first `pg_stripe_size = (block_size * (pg_size-parity_chunks))` bytes |
|||
of an image go to one PG, next `pg_stripe_size` bytes go to another PG and so on. |
|||
|
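For example (hypothetical pool), with the default 128 KB block_size, pg_size = 3
and parity_chunks = 1:

```
pg_stripe_size = 128 KB * (3 - 1) = 256 KB
bytes 0..256K-1 of an image map to one PG, bytes 256K..512K-1 to the next PG, and so on
```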
|||
Usually doesn't need to be changed separately from the block size.
|||
|
|||
## root_node |
|||
|
|||
- Type: string |
|||
|
|||
Specifies the root node of the OSD tree to restrict this pool's OSDs to.
The referenced root node must exist in /vitastor/config/node_placement.
|||
|
|||
## osd_tags |
|||
|
|||
- Type: string or array of strings |
|||
|
|||
Specifies OSD tags to restrict this pool to. If multiple tags are specified, |
|||
only OSDs having all of these tags will be used for this pool. |
|||
|
|||
## primary_affinity_tags |
|||
|
|||
- Type: string or array of strings |
|||
|
|||
Specifies OSD tags that are preferred when choosing primary OSDs for this pool.
|||
Note that for EC/XOR pools Vitastor always prefers to put primary OSD on one |
|||
of the OSDs containing a data chunk for a PG. |
|||
|
|||
# Examples |
|||
|
|||
## Replicated pool |
|||
|
|||
``` |
|||
{ |
|||
"1": { |
|||
"name":"testpool", |
|||
"scheme":"replicated", |
|||
"pg_size":2, |
|||
"pg_minsize":1, |
|||
"pg_count":256, |
|||
"failure_domain":"host" |
|||
} |
|||
} |
|||
``` |
|||
|
|||
## Erasure-coded pool |
|||
|
|||
``` |
|||
{ |
|||
"2": { |
|||
"name":"ecpool", |
|||
"scheme":"jerasure", |
|||
"pg_size":3, |
|||
"parity_chunks":1, |
|||
"pg_minsize":2, |
|||
"pg_count":256, |
|||
"failure_domain":"host" |
|||
} |
|||
} |
|||
``` |
@ -0,0 +1,41 @@ |
|||
--- |
|||
title: Packages |
|||
weight: 2 |
|||
--- |
|||
|
|||
## Debian |
|||
|
|||
- Trust Vitastor package signing key: |
|||
`wget -q -O - https://vitastor.io/debian/pubkey | sudo apt-key add -` |
|||
- Add Vitastor package repository to your /etc/apt/sources.list: |
|||
- Debian 11 (Bullseye/Sid): `deb https://vitastor.io/debian bullseye main` |
|||
- Debian 10 (Buster): `deb https://vitastor.io/debian buster main` |
|||
- For Debian 10 (Buster) also enable backports repository: |
|||
`deb http://deb.debian.org/debian buster-backports main` |
|||
- Install packages: `apt update; apt install vitastor lp-solve etcd linux-image-amd64 qemu` |
|||
|
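For example, the whole sequence for Debian 11 might look like this (how exactly you
add the repository line to /etc/apt/sources.list is up to you):

```
wget -q -O - https://vitastor.io/debian/pubkey | sudo apt-key add -
echo 'deb https://vitastor.io/debian bullseye main' | sudo tee -a /etc/apt/sources.list
sudo apt update
sudo apt install vitastor lp-solve etcd linux-image-amd64 qemu
```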
|||
## CentOS |
|||
|
|||
- Add Vitastor package repository: |
|||
- CentOS 7: `yum install https://vitastor.io/rpms/centos/7/vitastor-release-1.0-1.el7.noarch.rpm` |
|||
- CentOS 8: `dnf install https://vitastor.io/rpms/centos/8/vitastor-release-1.0-1.el8.noarch.rpm` |
|||
- Enable EPEL: `yum/dnf install epel-release` |
|||
- Enable additional CentOS repositories: |
|||
- CentOS 7: `yum install centos-release-scl` |
|||
- CentOS 8: `dnf install centos-release-advanced-virtualization` |
|||
- Enable elrepo-kernel: |
|||
- CentOS 7: `yum install https://www.elrepo.org/elrepo-release-7.el7.elrepo.noarch.rpm` |
|||
- CentOS 8: `dnf install https://www.elrepo.org/elrepo-release-8.el8.elrepo.noarch.rpm` |
|||
- Install packages: `yum/dnf install vitastor lpsolve etcd kernel-ml qemu-kvm` |
|||
|
|||
## Installation requirements |
|||
|
|||
- Linux kernel 5.4 or newer, for io_uring support. 5.8 or later is highly
recommended because io_uring is a relatively new technology and there is
at least one known bug that reproduces with io_uring and HP SmartArray
controllers on 5.4
|||
- liburing 0.4 or newer |
|||
- lp_solve |
|||
- etcd 3.4.15 or newer. Earlier versions won't work because of various bugs, |
|||
for example [#12402](https://github.com/etcd-io/etcd/pull/12402). |
|||
- node.js 10 or newer |
@ -0,0 +1,72 @@ |
|||
--- |
|||
title: Quick Start |
|||
weight: 1 |
|||
--- |
|||
|
|||
Prepare: |
|||
|
|||
- Get some SATA or NVMe SSDs with capacitors (server-grade drives). You can use desktop SSDs |
|||
with lazy fsync, but prepare for inferior single-thread latency. Read more about capacitors |
|||
[here]({{< ref "config/layout-cluster#immediate_commit" >}}). |
|||
- Get a fast network (at least 10 Gbit/s). Something like Mellanox ConnectX-4 with RoCEv2 is ideal. |
|||
- Disable CPU powersaving: `cpupower idle-set -D 0 && cpupower frequency-set -g performance`. |
|||
- [Install Vitastor packages]({{< ref "installation/packages" >}}). |
|||
|
|||
## Configure monitors |
|||
|
|||
On the monitor hosts: |
|||
- Edit variables at the top of `/usr/lib/vitastor/mon/make-units.sh` to desired values. |
|||
- Create systemd units for the monitor and etcd: `/usr/lib/vitastor/mon/make-units.sh` |
|||
- Start etcd and monitors: `systemctl start etcd vitastor-mon` |
|||
|
|||
## Configure OSDs |
|||
|
|||
- Put etcd_address and osd_network into `/etc/vitastor/vitastor.conf`. Example: |
|||
``` |
|||
{ |
|||
"etcd_address": ["10.200.1.10:2379","10.200.1.11:2379","10.200.1.12:2379"], |
|||
"osd_network": "10.200.1.0/24" |
|||
} |
|||
``` |
|||
- Initialize OSDs: |
|||
- Simplest, SSD-only: `/usr/lib/vitastor/mon/make-osd.sh /dev/disk/by-partuuid/XXX [/dev/disk/by-partuuid/YYY ...]` |
|||
- Hybrid, HDD+SSD: `/usr/lib/vitastor/mon/make-osd-hybrid.js /dev/sda /dev/sdb ...` — pass all your |
|||
devices (HDD and SSD) to this script — it will partition disks and initialize journals on its own. |
|||
This script skips HDDs which are already partitioned so if you want to use non-empty disks for |
|||
Vitastor you should first wipe them with `wipefs -a`. SSDs with GPT partition table are not skipped, |
|||
but some free unpartitioned space must be available because the script creates new partitions for journals. |
|||
- You can change OSD configuration in units or in `vitastor.conf`. |
|||
Check [Configuration Reference]({{< ref "config" >}}) for parameter descriptions. |
|||
- `systemctl start vitastor.target` everywhere. |
|||
- If all your drives have capacitors, create global configuration in etcd: \ |
|||
`etcdctl --endpoints=... put /vitastor/config/global '{"immediate_commit":"all"}'` |
|||
|
|||
## Create a pool |
|||
|
|||
Create pool configuration in etcd: |
|||
|
|||
``` |
|||
etcdctl --endpoints=... put /vitastor/config/pools '{"1":{"name":"testpool", |
|||
"scheme":"replicated","pg_size":2,"pg_minsize":1,"pg_count":256,"failure_domain":"host"}}' |
|||
``` |
|||
|
|||
For jerasure pools the configuration should look like the following: |
|||
|
|||
``` |
|||
etcdctl --endpoints=... put /vitastor/config/pools '{"2":{"name":"ecpool",
"scheme":"jerasure","pg_size":4,"parity_chunks":2,"pg_minsize":2,"pg_count":256,"failure_domain":"host"}}'
|||
``` |
|||
|
|||
After you do this, one of the monitors will configure PGs and OSDs will start them. |
|||
|
|||
You can check PG states with `etcdctl --endpoints=... get --prefix /vitastor/pg/state`. All PGs should become 'active'. |
|||
|
|||
## Create an image |
|||
|
|||
Use vitastor-cli ([read CLI documentation here]({{< ref "usage/cli" >}})): |
|||
|
|||
``` |
|||
vitastor-cli create -s 10G testimg |
|||
``` |
|||
|
|||
After that, you can run benchmarks or start QEMU manually with this image. |
@ -0,0 +1,54 @@ |
|||
--- |
|||
title: Building from Source |
|||
weight: 3 |
|||
--- |
|||
|
|||
## Requirements |
|||
|
|||
- gcc and g++ 8 or newer, clang 10 or newer, or other compiler with C++11 plus |
|||
designated initializers support from C++20 |
|||
- CMake |
|||
- liburing, jerasure headers |
|||
|
|||
## Basic instructions |
|||
|
|||
Download source, for example using git: `git clone --recurse-submodules https://yourcmc.ru/git/vitalif/vitastor/` |
|||
|
|||
Get `fio` source and symlink it into `<vitastor>/fio`. If you don't want to build fio engine, |
|||
you can disable it by passing `-DWITH_FIO=no` to cmake. |
|||
|
|||
Build and install Vitastor: |
|||
|
|||
``` |
|||
cd vitastor |
|||
mkdir build |
|||
cd build |
|||
cmake .. && make -j8 install |
|||
``` |
|||
|
|||
## QEMU Driver |
|||
|
|||
It's recommended to build the QEMU driver (qemu_driver.c) in-tree, as a part of |
|||
QEMU build process. To do that: |
|||
- Install vitastor client library headers (from source or from vitastor-client-dev package) |
|||
- Take a corresponding patch from `patches/qemu-*-vitastor.patch` and apply it to QEMU source |
|||
- Copy `src/qemu_driver.c` to QEMU source directory as `block/block-vitastor.c` |
|||
- Build QEMU as usual |
|||
|
|||
But it is also possible to build it out-of-tree. To do that: |
|||
- Get QEMU source, begin to build it, stop the build and copy headers: |
|||
- `<qemu>/include` → `<vitastor>/qemu/include` |
|||
- Debian: |
|||
* Use qemu packages from the main repository |
|||
* `<qemu>/b/qemu/config-host.h` → `<vitastor>/qemu/b/qemu/config-host.h` |
|||
* `<qemu>/b/qemu/qapi` → `<vitastor>/qemu/b/qemu/qapi` |
|||
- CentOS 8: |
|||
* Use qemu packages from the Advanced-Virtualization repository. To enable it, run |
|||
`yum install centos-release-advanced-virtualization.noarch` and then `yum install qemu` |
|||
* `<qemu>/config-host.h` → `<vitastor>/qemu/b/qemu/config-host.h` |
|||
* For QEMU 3.0+: `<qemu>/qapi` → `<vitastor>/qemu/b/qemu/qapi` |
|||
* For QEMU 2.0+: `<qemu>/qapi-types.h` → `<vitastor>/qemu/b/qemu/qapi-types.h` |
|||
- `config-host.h` and `qapi` are required because they contain generated headers |
|||
- Configure Vitastor with `WITH_QEMU=yes` and, if you're on RHEL, also with `QEMU_PLUGINDIR=qemu-kvm`: |
|||
`cmake .. -DWITH_QEMU=yes`. |
|||
- After that, Vitastor will build `block-vitastor.so` during its build process. |
@ -0,0 +1,4 @@ |
|||
--- |
|||
title: Introduction |
|||
weight: -1 |
|||
--- |
@ -0,0 +1,73 @@ |
|||
--- |
|||
title: Architecture |
|||
weight: 3 |
|||
--- |
|||
|
|||
For people familiar with Ceph, Vitastor is quite similar: |
|||
|
|||
- Vitastor also has Pools, PGs, OSDs, Monitors, Failure Domains, Placement Tree: |
|||
- OSD (Object Storage Daemon) is a process that stores data and serves read/write requests. |
|||
- PG (Placement Group) is a container for data that (normally) shares the same replicas. |
|||
- Pool is a container for data that has the same redundancy scheme and placement rules. |
|||
- Monitor is a separate daemon that watches cluster state and controls data distribution. |
|||
- Failure Domain is a group of OSDs that you allow to fail. It's "host" by default. |
|||
- Placement Tree groups OSDs in a hierarchy to later split them into Failure Domains. |
|||
- Vitastor also distributes every image's data across the whole cluster.
|||
- Vitastor is also transactional (every write to the cluster is atomic). |
|||
- OSDs also have journal and metadata and they can also be put on separate drives. |
|||
- Just like in Ceph, client library attempts to recover from any cluster failure so |
|||
you can basically reboot the whole cluster and only pause, but not crash, your clients |
|||
(please report a bug if the client crashes in that case). |
|||
|
|||
However, there are also differences: |
|||
|
|||
- Vitastor's main focus is on SSDs. Hybrid SSD+HDD setups are also possible. |
|||
- Vitastor OSD is (and will always be) single-threaded. If you want to dedicate more than 1 core |
|||
per drive you should run multiple OSDs each on a different partition of the drive. |
|||
Vitastor isn't CPU-hungry though (as opposed to Ceph), so 1 core is sufficient in a lot of cases. |
|||
- Metadata and journal are always kept in memory. Metadata size depends linearly on drive capacity
and data store block size, which is 128 KB by default. With 128 KB blocks, metadata should occupy
around 512 MB per 1 TB (which is still less than Ceph wants); see the rough estimate after this list.
The journal doesn't have to be big: the example test below was conducted with only a 16 MB journal.
A big journal is probably even harmful, as dirty write metadata also takes some memory.
|||
- Vitastor storage layer doesn't have internal copy-on-write or redirect-write. I know that maybe |
|||
it's possible to create a good copy-on-write storage, but it's much harder and makes performance |
|||
less deterministic, so CoW isn't used in Vitastor. |
|||
- The basic layer of Vitastor is block storage with fixed-size blocks, not object storage with |
|||
rich semantics like in Ceph (RADOS). |
|||
- There's a "lazy fsync" mode which allows batching writes before flushing them to the disk.
It makes it possible to use Vitastor with desktop SSDs, but it still lowers performance due to
additional network roundtrips, so use server SSDs with capacitor-based power loss protection
("Advanced Power Loss Protection") for best performance.
|||
- PGs are ephemeral. This means that they aren't stored on data disks and only exist in memory |
|||
while OSDs are running. |
|||
- Recovery process is per-object (per-block), not per-PG. Also there are no PGLOGs. |
|||
- Monitors don't store data. Cluster configuration and state is stored in etcd in simple human-readable |
|||
JSON structures. Monitors only watch cluster state and handle data movement. |
|||
Thus Vitastor's Monitor isn't a critical component of the system and is more similar to Ceph's Manager. |
|||
Vitastor's Monitor is implemented in node.js. |
|||
- PG distribution isn't based on consistent hashes. All PG mappings are stored in etcd. |
|||
Rebalancing PGs between OSDs is done by mathematical optimization - data distribution problem |
|||
is reduced to a linear programming problem and solved by lp_solve. This allows for almost |
|||
perfect (96-99% uniformity compared to Ceph's 80-90%) data distribution in most cases, ability |
|||
to map PGs by hand without breaking rebalancing logic, reduced OSD peer-to-peer communication |
|||
(on average, OSDs have fewer peers) and less data movement. It also probably has a drawback - |
|||
this method may fail in very large clusters, but up to several hundreds of OSDs it's perfectly fine. |
|||
It's also easy to add consistent hashes in the future if something proves their necessity. |
|||
- There's no separate CRUSH layer. You select pool redundancy scheme, placement root, failure domain |
|||
and so on directly in pool configuration. |
|||
- Images are global i.e. you can't create multiple images with the same name in different pools. |
|||
|
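A rough sanity check of the memory estimate mentioned in the list above (assuming the
default 128 KB block size):

```
1 TB / 128 KB = 8388608 data blocks
512 MB / 8388608 blocks ≈ 64 bytes of in-memory metadata per block
```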
|||
## Implementation Principles |
|||
|
|||
- I like architecturally simple solutions. Vitastor is and will always be designed |
|||
exactly like that. |
|||
- I also like reinventing the wheel to some extent, like writing my own HTTP client |
|||
for etcd interaction instead of using prebuilt libraries, because in this case |
|||
I'm confident about what my code does and what it doesn't do. |
|||
- I don't care about C++ "best practices" like RAII or proper inheritance or usage of |
|||
smart pointers or whatever and I don't intend to change my mind, so if you're here |
|||
looking for ideal reference C++ code, this probably isn't the right place. |
|||
- I like node.js better than any other dynamically-typed language interpreter |
|||
because it's faster than any other interpreter in the world, has neutral C-like |
|||
syntax and built-in event loop. That's why Monitor is implemented in node.js. |
@ -0,0 +1,34 @@ |
|||
--- |
|||
title: Author and License |
|||
weight: 3 |
|||
--- |
|||
|
|||
Copyright (c) Vitaliy Filippov (vitalif [at] yourcmc.ru), 2019+ |
|||
|
|||
Join Vitastor Telegram Chat: https://t.me/vitastor |
|||
|
|||
All server-side code (OSD, Monitor and so on) is licensed under the terms of |
|||
Vitastor Network Public License 1.1 (VNPL 1.1), a copyleft license based on |
|||
GNU GPLv3.0 with the additional "Network Interaction" clause which requires |
|||
opensourcing all programs directly or indirectly interacting with Vitastor |
|||
through a computer network and expressly designed to be used in conjunction |
|||
with it ("Proxy Programs"). Proxy Programs may be made public not only under |
|||
the terms of the same license, but also under the terms of any GPL-Compatible |
|||
Free Software License, as listed by the Free Software Foundation. |
|||
This is a stricter copyleft license than the Affero GPL. |
|||
|
|||
Please note that VNPL doesn't require you to open the code of proprietary |
|||
software running inside a VM if it's not specially designed to be used with |
|||
Vitastor. |
|||
|
|||
Basically, you can't use the software in a proprietary environment to provide |
|||
its functionality to users without opensourcing all intermediary components |
|||
standing between the user and Vitastor or purchasing a commercial license |
|||
from the author 😀. |
|||
|
|||
Client libraries (cluster_client and so on) are dual-licensed under the same |
|||
VNPL 1.1 and also GNU GPL 2.0 or later to allow for compatibility with GPLed |
|||
software like QEMU and fio. |
|||
|
|||
You can find the full text of VNPL-1.1 in the file [VNPL-1.1.txt](VNPL-1.1.txt). |
|||
GPL 2.0 is also included in this repository as [GPL-2.0.txt](GPL-2.0.txt). |
@ -0,0 +1,60 @@ |
|||
--- |
|||
title: Features |
|||
weight: 1 |
|||
--- |
|||
|
|||
Vitastor is currently a pre-release and it still lacks some important features.
|||
However, the following is implemented: |
|||
|
|||
- Basic part: highly-available block storage with symmetric clustering and no SPOF |
|||
- Performance ;-D |
|||
- Multiple redundancy schemes: Replication, XOR n+1, Reed-Solomon erasure codes |
|||
based on jerasure library with any number of data and parity drives in a group |
|||
- Configuration via simple JSON data structures in etcd (parameters, pools and images) |
|||
- Automatic data distribution over OSDs, with support for: |
|||
- Mathematical optimization for better uniformity and less data movement |
|||
- Multiple pools |
|||
- Placement tree, OSD selection by tags (device classes) and placement root |
|||
- Configurable failure domains |
|||
- Recovery of degraded blocks |
|||
- Rebalancing (data movement between OSDs) |
|||
- Lazy fsync support |
|||
- Per-OSD and per-image I/O and space usage statistics in etcd |
|||
- Snapshots and copy-on-write image clones |
|||
- Write throttling to smooth random write workloads in SSD+HDD configurations |
|||
- RDMA/RoCEv2 support via libibverbs |
|||
|
|||
CLI (vitastor-cli): |
|||
- Pool listing and space stats (df) |
|||
- Image listing, space and I/O stats (ls) |
|||
- Image and snapshot creation (create, modify) |
|||
- Image removal and snapshot merge (rm, flatten, merge, rm-data) |
|||
|
|||
Plugins and packaging: |
|||
- Debian and CentOS packages |
|||
- Generic user-space client library |
|||
- Native QEMU driver |
|||
- Loadable fio engine for benchmarks |
|||
- NBD proxy for kernel mounts |
|||
- CSI plugin for Kubernetes |
|||
- OpenStack support: Cinder driver, Nova and libvirt patches |
|||
- Proxmox storage plugin and packages |
|||
|
|||
## Roadmap |
|||
|
|||
The following features are planned for the future: |
|||
|
|||
- Better OSD creation and auto-start tools |
|||
- Other administrative tools |
|||
- Web GUI |
|||
- OpenNebula plugin |
|||
- iSCSI proxy |
|||
- Simplified NFS proxy |
|||
- Multi-threaded client |
|||
- Faster failover |
|||
- Scrubbing without checksums (verification of replicas) |
|||
- Checksums |
|||
- Tiered storage (SSD caching) |
|||
- NVDIMM support |
|||
- Compression (possibly) |
|||
- Read caching using system page cache (possibly) |
@ -0,0 +1,93 @@ |
|||
--- |
|||
title: Example Comparison with Ceph |
|||
weight: 4 |
|||
--- |
|||
|
|||
Hardware configuration: 4 nodes, each with: |
|||
- 6x SATA SSD Intel D3-S4510 3.84 TB |
|||
- 2x Xeon Gold 6242 (16 cores @ 2.8 GHz) |
|||
- 384 GB RAM |
|||
- 1x 25 GbE network interface (Mellanox ConnectX-4 LX), connected to a Juniper QFX5200 switch |
|||
|
|||
CPU powersaving was disabled. Both Vitastor and Ceph were configured with 2 OSDs per 1 SSD. |
|||
|
|||
All of the results below apply to 4 KB blocks and random access (unless indicated otherwise). |
|||
|
|||
T8Q64 tests were conducted over 8 400GB RBD images from all hosts (every host was running 2 instances of fio). |
|||
This is because Ceph has performance penalties related to running multiple clients over a single RBD image. |
|||
|
|||
cephx_sign_messages was set to false during tests, RocksDB and Bluestore settings were left at defaults. |
|||
|
|||
T8Q64 read test was conducted over 1 larger inode (3.2T) from all hosts (every host was running 2 instances of fio). |
|||
Vitastor has no performance penalties related to running multiple clients over a single inode. |
|||
When conducted from one node with all primary OSDs moved to other nodes, the result was slightly lower (689000 iops);
this is because all operations resulted in network roundtrips between the client and the primary OSD.
When fio was colocated with OSDs (like in the Ceph benchmarks above), 1/4 of the read workload actually
used the loopback network.
|||
|
|||
Vitastor was configured with: `--disable_data_fsync true --immediate_commit all --flusher_count 8 |
|||
--disk_alignment 4096 --journal_block_size 4096 --meta_block_size 4096 |
|||
--journal_no_same_sector_overwrites true --journal_sector_buffer_count 1024 |
|||
--journal_size 16777216`. |
|||
|
|||
## Raw drive performance |
|||
|
|||
- T1Q1 write ~27000 iops (~0.037ms latency) |
|||
- T1Q1 read ~9800 iops (~0.101ms latency) |
|||
- T1Q32 write ~60000 iops |
|||
- T1Q32 read ~81700 iops |
|||
|
|||
## 2 replicas |
|||
|
|||
### Ceph 15.2.4 (Bluestore) |
|||
|
|||
- T1Q1 write ~1000 iops (~1ms latency) |
|||
- T1Q1 read ~1750 iops (~0.57ms latency) |
|||
- T8Q64 write ~100000 iops, total CPU usage by OSDs about 40 virtual cores on each node |
|||
- T8Q64 read ~480000 iops, total CPU usage by OSDs about 40 virtual cores on each node |
|||
|
|||
In fact, not that bad for Ceph. These servers are an example of well-balanced Ceph nodes. |
|||
However, CPU usage and I/O latency were through the roof, as usual. |
|||
|
|||
### Vitastor 0.4.0 (native) |
|||
|
|||
- T1Q1 write: 7087 iops (0.14ms latency) |
|||
- T1Q1 read: 6838 iops (0.145ms latency) |
|||
- T2Q64 write: 162000 iops, total CPU usage by OSDs about 3 virtual cores on each node |
|||
- T8Q64 read: 895000 iops, total CPU usage by OSDs about 4 virtual cores on each node |
|||
- Linear write (4M T1Q32): 2800 MB/s |
|||
- Linear read (4M T1Q32): 1500 MB/s |
|||
|
|||
### Vitastor 0.4.0 (NBD) |
|||
|
|||
NBD is currently required to mount Vitastor via the kernel, but it imposes additional overhead
due to extra copying between the kernel and userspace. This mostly hurts linear
bandwidth, not iops.
|||
|
|||
Vitastor with single-threaded NBD on the same hardware: |
|||
- T1Q1 write: 6000 iops (0.166ms latency) |
|||
- T1Q1 read: 5518 iops (0.18ms latency) |
|||
- T1Q128 write: 94400 iops |
|||
- T1Q128 read: 103000 iops |
|||
- Linear write (4M T1Q128): 1266 MB/s (compared to 2800 MB/s via fio) |
|||
- Linear read (4M T1Q128): 975 MB/s (compared to 1500 MB/s via fio) |
|||
|
|||
## EC/XOR 2+1 |
|||
|
|||
### Ceph 15.2.4 |
|||
|
|||
- T1Q1 write: 730 iops (~1.37ms latency) |
|||
- T1Q1 read: 1500 iops with cold cache (~0.66ms latency), 2300 iops after 2 minute metadata cache warmup (~0.435ms latency) |
|||
- T4Q128 write (4 RBD images): 45300 iops, total CPU usage by OSDs about 30 virtual cores on each node |
|||
- T8Q64 read (4 RBD images): 278600 iops, total CPU usage by OSDs about 40 virtual cores on each node |
|||
- Linear write (4M T1Q32): 1950 MB/s before preallocation, 2500 MB/s after preallocation |
|||
- Linear read (4M T1Q32): 2400 MB/s |
|||
|
|||
### Vitastor 0.4.0 |
|||
|
|||
- T1Q1 write: 2808 iops (~0.355ms latency) |
|||
- T1Q1 read: 6190 iops (~0.16ms latency) |
|||
- T2Q64 write: 85500 iops, total CPU usage by OSDs about 3.4 virtual cores on each node |
|||
- T8Q64 read: 812000 iops, total CPU usage by OSDs about 4.7 virtual cores on each node |
|||
- Linear write (4M T1Q32): 3200 MB/s |
|||
- Linear read (4M T1Q32): 1800 MB/s |
@ -0,0 +1,46 @@ |
|||
--- |
|||
title: Vitastor's Theoretical Maximum Performance |
|||
weight: 3 |
|||
--- |
|||
|
|||
Replicated setups: |
|||
- Single-threaded (T1Q1) read latency: 1 network roundtrip + 1 disk read. |
|||
- Single-threaded write+fsync latency: |
|||
- With immediate commit: 2 network roundtrips + 1 disk write. |
|||
- With lazy commit: 4 network roundtrips + 1 disk write + 1 disk flush. |
|||
- Saturated parallel read iops: min(network bandwidth, sum(disk read iops)). |
|||
- Saturated parallel write iops: min(network bandwidth, sum(disk write iops / number of replicas / write amplification)). |
|||
|
|||
EC/XOR setups: |
|||
- Single-threaded (T1Q1) read latency: 1.5 network roundtrips + 1 disk read. |
|||
- Single-threaded write+fsync latency: |
|||
- With immediate commit: 3.5 network roundtrips + 1 disk read + 2 disk writes. |
|||
- With lazy commit: 5.5 network roundtrips + 1 disk read + 2 disk writes + 2 disk fsyncs. |
|||
- 0.5 is actually (k-1)/k, which means that the additional roundtrip doesn't happen when
the read sub-operation can be served locally.
|||
- Saturated parallel read iops: min(network bandwidth, sum(disk read iops)). |
|||
- Saturated parallel write iops: min(network bandwidth, sum(disk write iops * number of data drives / (number of data + parity drives) / write amplification)). |
|||
In fact, you should use disk write iops measured under a ~10% read / ~90% write workload in this formula.
|||
|
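A hypothetical illustration of the write formula above: 12 OSDs, each capable of 20000
4 KB random write iops under such a mixed load, EC 2+1, write amplification of 5, and
the network not being the bottleneck:

```
saturated write iops ≈ 12 * 20000 * 2 / (2 + 1) / 5 = 32000
```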
|||
Write amplification for 4 KB blocks is usually 3-5 in Vitastor: |
|||
1. Journal block write |
|||
2. Journal data write |
|||
3. Metadata block write |
|||
4. Another journal block write for EC/XOR setups |
|||
5. Data block write |
|||
|
|||
If you manage to get an SSD which handles 512 byte blocks well (Optane?) you may |
|||
lower 1, 3 and 4 to 512 bytes (1/8 of data size) and get WA as low as 2.375. |
|||
|
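As a sanity check of the 2.375 figure (4 KB user writes, with journal and metadata block
writes reduced to 512 bytes):

```
(0.5 + 4 + 0.5 + 0.5 + 4) KB written / 4 KB of data = 9.5 / 4 = 2.375
```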
|||
Lazy fsync also reduces WA for parallel workloads because journal blocks are only |
|||
written when they fill up or fsync is requested. |
|||
|
|||
## In Practice |
|||
|
|||
In practice, using tests from [Understanding Performance]({{< ref "performance/understanding" >}}) |
|||
and good server-grade SSD/NVMe drives, you should head for: |
|||
- At least 5000 T1Q1 replicated read and write iops (maximum 0.2ms latency) |
|||
- At least ~80k parallel read iops or ~30k write iops per 1 core (1 OSD) |
|||
- Disk-speed or wire-speed linear reads and writes, whichever is the bottleneck in your case |
|||
|
|||
If your results are lower, that may mean you have bad drives, bad network or some kind of misconfiguration. |
@ -0,0 +1,6 @@ |
|||
--- |
|||
title: Tuning |
|||
weight: 2 |
|||
--- |
|||
|
|||
- Disable CPU powersaving |
@ -0,0 +1,52 @@ |
|||
--- |
|||
title: Understanding Storage Performance |
|||
weight: 1 |
|||
--- |
|||
|
|||
The most important thing for fast storage is latency, not parallel iops. |
|||
|
|||
The best possible latency is achieved with one thread and queue depth of 1 which basically means |
|||
"client load as low as possible". In this case IOPS = 1/latency, and this number doesn't |
|||
scale with number of servers, drives, server processes or threads and so on. |
|||
Single-threaded IOPS and latency numbers only depend on *how fast a single daemon is*. |
|||
|
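For example, at queue depth 1:

```
0.1 ms latency  ->  1 / 0.0001 s = 10000 iops
0.5 ms latency  ->  1 / 0.0005 s = 2000 iops
```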
|||
Why is it important? It's important because some of the applications *can't* use |
|||
queue depth greater than 1 because their task isn't parallelizable. A notable example |
|||
is any ACID DBMS because all of them write their WALs sequentially with fsync()s. |
|||
|
|||
fsync, by the way, is another important thing often missing in benchmarks. The point is |
|||
that drives have cache buffers and don't guarantee that your data is actually persisted |
|||
until you call fsync() which is translated to a FLUSH CACHE command by the OS. |
|||
|
|||
Desktop SSDs are very fast without fsync - NVMes, for example, can process ~80000 write |
|||
operations per second with queue depth of 1 without fsync - but they're really slow with |
|||
fsync because they have to actually write data to flash chips when you call fsync. Typical |
|||
number is around 1000-2000 iops with fsync. |
|||
|
|||
Server SSDs often have supercapacitors that act as a built-in UPS and allow the drive |
|||
to flush its DRAM cache to the persistent flash storage when a power loss occurs. |
|||
This makes them perform equally well with and without fsync. This feature is called |
|||
"Advanced Power Loss Protection" by Intel; other vendors either call it similarly |
|||
or directly as "Full Capacitor-Based Power Loss Protection". |
|||
|
|||
All software-defined storages that I currently know are slow in terms of latency. |
|||
Notable examples are Ceph and internal SDSes used by cloud providers like Amazon, Google, |
|||
Yandex and so on. They're all slow and can only reach ~0.3ms read and ~0.6ms 4 KB write latency |
|||
with best-in-slot hardware. |
|||
|
|||
And that's in the SSD era when you can buy an SSD that has ~0.04ms latency for $100.
|||
|
|||
I use the following 6 commands with small variations to benchmark any storage: |
|||
|
|||
- Linear write: |
|||
`fio -ioengine=libaio -direct=1 -invalidate=1 -name=test -bs=4M -iodepth=32 -rw=write -runtime=60 -filename=/dev/sdX` |
|||
- Linear read: |
|||
`fio -ioengine=libaio -direct=1 -invalidate=1 -name=test -bs=4M -iodepth=32 -rw=read -runtime=60 -filename=/dev/sdX` |
|||
- Random write latency (T1Q1, this hurts storages the most): |
|||
`fio -ioengine=libaio -direct=1 -invalidate=1 -name=test -bs=4k -iodepth=1 -fsync=1 -rw=randwrite -runtime=60 -filename=/dev/sdX` |
|||
- Random read latency (T1Q1): |
|||
`fio -ioengine=libaio -direct=1 -invalidate=1 -name=test -bs=4k -iodepth=1 -rw=randread -runtime=60 -filename=/dev/sdX` |
|||
- Parallel write iops (use numjobs if a single CPU core is insufficient to saturate the load): |
|||
`fio -ioengine=libaio -direct=1 -invalidate=1 -name=test -bs=4k -iodepth=128 [-numjobs=4 -group_reporting] -rw=randwrite -runtime=60 -filename=/dev/sdX` |
|||
- Parallel read iops (use numjobs if a single CPU core is insufficient to saturate the load): |
|||
`fio -ioengine=libaio -direct=1 -invalidate=1 -name=test -bs=4k -iodepth=128 [-numjobs=4 -group_reporting] -rw=randread -runtime=60 -filename=/dev/sdX` |
@ -0,0 +1,183 @@ |
|||
--- |
|||
title: Vitastor CLI |
|||
weight: 1 |
|||
--- |
|||
|
|||
vitastor-cli is a command-line tool for administrative tasks like image management. |
|||
|
|||
It supports the following commands: |
|||
|
|||
{{< toc >}} |
|||
|
|||
Global options: |
|||
|
|||
``` |
|||
--etcd_address ADDR Etcd connection address |
|||
--iodepth N Send N operations in parallel to each OSD when possible (default 32) |
|||
--parallel_osds M Work with M osds in parallel when possible (default 4) |
|||
--progress 1|0 Report progress (default 1) |
|||
--cas 1|0 Use online CAS writes when possible (default auto) |
|||
--no-color Disable colored output |
|||
--json JSON output |
|||
``` |
|||
|
|||
## status |
|||
|
|||
`vitastor-cli status` |
|||
|
|||
Show cluster status. |
|||
|
|||
Example output: |
|||
|
|||
``` |
|||
cluster: |
|||
etcd: 1 / 1 up, 1.8 M database size |
|||
mon: 1 up, master stump |
|||
osd: 8 / 12 up |
|||
|
|||
data: |
|||
raw: 498.5 G used, 301.2 G / 799.7 G available, 399.8 G down |
|||
state: 156.6 G clean, 97.6 G misplaced |
|||
pools: 2 / 3 active |
|||
pgs: 30 active |
|||
34 active+has_misplaced |
|||
32 offline |
|||
|
|||
io: |
|||
client: 0 B/s rd, 0 op/s rd, 0 B/s wr, 0 op/s wr |
|||
rebalance: 989.8 M/s, 7.9 K op/s |
|||
``` |
|||
|
|||
## df |
|||
|
|||
`vitastor-cli df` |
|||
|
|||
Show pool space statistics. |
|||
|
|||
Example output: |
|||
|
|||
``` |
|||
NAME SCHEME PGS TOTAL USED AVAILABLE USED% EFFICIENCY |
|||
testpool 2/1 32 100 G 34.2 G 60.7 G 39.23% 100% |
|||
size1 1/1 32 199.9 G 10 G 121.5 G 39.23% 100% |
|||
kaveri 2/1 32 0 B 10 G 0 B 100% 0% |
|||
``` |
|||
|
|||
In the example above, the "kaveri" pool has "zero" efficiency because all of its OSDs are down.
|||
|
|||
## ls |
|||
|
|||
`vitastor-cli ls [-l] [-p POOL] [--sort FIELD] [-r] [-n N] [<glob> ...]` |
|||
|
|||
List images (only matching `<glob>` pattern(s) if passed). |
|||
|
|||
Options: |
|||
|
|||
``` |
|||
-p|--pool POOL Filter images by pool ID or name |
|||
-l|--long Also report allocated size and I/O statistics |
|||
--del Also include delete operation statistics |
|||
--sort FIELD Sort by specified field (name, size, used_size, <read|write|delete>_<iops|bps|lat|queue>) |
|||
-r|--reverse Sort in descending order |
|||
-n|--count N Only list first N items |
|||
``` |
|||
|
|||
Example output: |
|||
|
|||
``` |
|||
NAME POOL SIZE USED READ IOPS QUEUE LAT WRITE IOPS QUEUE LAT FLAGS PARENT |
|||
debian9 testpool 20 G 12.3 G 0 B/s 0 0 0 us 0 B/s 0 0 0 us RO |
|||
pve/vm-100-disk-0 testpool 20 G 0 B 0 B/s 0 0 0 us 0 B/s 0 0 0 us - debian9 |
|||
pve/base-101-disk-0 testpool 20 G 0 B 0 B/s 0 0 0 us 0 B/s 0 0 0 us RO debian9 |
|||
pve/vm-102-disk-0 testpool 32 G 36.4 M 0 B/s 0 0 0 us 0 B/s 0 0 0 us - pve/base-101-disk-0 |
|||
debian9-test testpool 20 G 36.6 M 0 B/s 0 0 0 us 0 B/s 0 0 0 us - debian9 |
|||
bench testpool 10 G 10 G 0 B/s 0 0 0 us 0 B/s 0 0 0 us - |
|||
bench-kaveri kaveri 10 G 10 G 0 B/s 0 0 0 us 0 B/s 0 0 0 us - |
|||
``` |
|||
|
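For example, to show the 5 images with the most write iops in a hypothetical pool "testpool":

```
vitastor-cli ls -l -p testpool --sort write_iops -r -n 5
```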
|||
## create |
|||
|
|||
`vitastor-cli create -s|--size <size> [-p|--pool <id|name>] [--parent <parent_name>[@<snapshot>]] <name>` |
|||
|
|||
Create an image. You may use K/M/G/T suffixes for `<size>`. If `--parent` is specified, |
|||
a copy-on-write image clone is created. Parent must be a snapshot (readonly image). |
|||
Pool must be specified if there is more than one pool. |
|||
|
|||
``` |
|||
vitastor-cli create --snapshot <snapshot> [-p|--pool <id|name>] <image> |
|||
vitastor-cli snap-create [-p|--pool <id|name>] <image>@<snapshot> |
|||
``` |
|||
|
|||
Create a snapshot of image `<name>` (either form can be used). May be used live if only a single writer is active. |
|||
|
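For example (image and snapshot names are hypothetical):

```
# create a 10 GB image in pool "testpool"
vitastor-cli create -s 10G -p testpool testimg

# snapshot it and create a copy-on-write clone based on the snapshot
vitastor-cli snap-create testimg@snap1
vitastor-cli create -s 10G -p testpool --parent testimg@snap1 testimg-clone
```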
|||
## modify |
|||
|
|||
`vitastor-cli modify <name> [--rename <new-name>] [--resize <size>] [--readonly | --readwrite] [-f|--force]` |
|||
|
|||
Rename, resize image or change its readonly status. Images with children can't be made read-write. |
|||
If the new size is smaller than the old size, extra data will be purged. |
|||
You should resize file system in the image, if present, before shrinking it. |
|||
|
|||
``` |
|||
-f|--force Proceed with shrinking or setting readwrite flag even if the image has children. |
|||
``` |
|||
|
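For example, to grow a hypothetical image and then make it read-only:

```
vitastor-cli modify testimg --resize 20G
vitastor-cli modify testimg --readonly
```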
|||
## rm |
|||
|
|||
`vitastor-cli rm <from> [<to>] [--writers-stopped]` |
|||
|
|||
Remove `<from>` or all layers between `<from>` and `<to>` (`<to>` must be a child of `<from>`),
rebasing all their children accordingly. `--writers-stopped` allows merging to be a bit
more effective in the case of a single "slim" read-write child and a "fat" removed parent:
in that case the child is merged into the parent and the parent is then renamed to the child.
In all other cases parent layers are always merged into children.
|||
|
|||
## flatten |
|||
|
|||
`vitastor-cli flatten <layer>` |
|||
|
|||
Flatten a layer, i.e. merge data and detach it from parents. |
|||
|
|||
## rm-data |
|||
|
|||
`vitastor-cli rm-data --pool <pool> --inode <inode> [--wait-list] [--min-offset <offset>]` |
|||
|
|||
Remove inode data without changing metadata. |
|||
|
|||
``` |
|||
--wait-list Retrieve full objects listings before starting to remove objects. |
|||
Requires more memory, but allows to show correct removal progress. |
|||
--min-offset Purge only data starting with specified offset. |
|||
``` |
|||
|
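For example, to purge the data of a hypothetical inode 123 in pool 2:

```
vitastor-cli rm-data --pool 2 --inode 123 --wait-list
```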
|||
## merge-data |
|||
|
|||
`vitastor-cli merge-data <from> <to> [--target <target>]` |
|||
|
|||
Merge layer data without changing metadata. Merge `<from>`..`<to>` to `<target>`. |
|||
`<to>` must be a child of `<from>` and `<target>` may be one of the layers between |
|||
`<from>` and `<to>`, including `<from>` and `<to>`. |
|||
|
|||
## alloc-osd |
|||
|
|||
`vitastor-cli alloc-osd` |
|||
|
|||
Allocate a new OSD number and reserve it by creating empty `/osd/stats/<n>` key. |
|||
|
|||
## simple-offsets |
|||
|
|||
`vitastor-cli simple-offsets <device>` |
|||
|
|||
Calculate offsets for simple&stupid (no superblock) OSD deployment. |
|||
|
|||
Options: |
|||
|
|||
``` |
|||
--object_size 128k Set blockstore block size |
|||
--bitmap_granularity 4k Set bitmap granularity |
|||
--journal_size 16M Set journal size |
|||
--device_block_size 4k Set device block size |
|||
--journal_offset 0 Set journal offset |
|||
--device_size 0 Set device size |
|||
--format text Result format: json, options, env, or text |
|||
``` |
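For example, to calculate offsets for a hypothetical device with a 32 MB journal and
print them as environment variables:

```
vitastor-cli simple-offsets --journal_size 32M --format env /dev/nvme0n1
```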
@ -0,0 +1,20 @@ |
|||
--- |
|||
title: NBD |
|||
weight: 6 |
|||
--- |
|||
|
|||
To create a local block device for a Vitastor image, use NBD. For example: |
|||
|
|||
``` |
|||
vitastor-nbd map --etcd_address 10.115.0.10:2379/v3 --image testimg |
|||
``` |
|||
|
|||
It will output the device name, like /dev/nbd0, which you can then format and mount as a normal block device.
|||
|
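For example, to put a regular filesystem on it (assuming the device name printed above):

```
mkfs.ext4 /dev/nbd0
mount /dev/nbd0 /mnt
```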
|||
You can also use `--pool <POOL> --inode <INODE> --size <SIZE>` instead of `--image <IMAGE>` if you want. |
|||
|
|||
To unmap the device run: |
|||
|
|||
``` |
|||
vitastor-nbd unmap /dev/nbd0 |
|||
``` |
@ -0,0 +1,39 @@ |
|||
--- |
|||
title: QEMU and qemu-img |
|||
weight: 2 |
|||
--- |
|||
|
|||
You need a patched QEMU version to use the Vitastor driver.
|||
|
|||
To start a VM using plain QEMU command-line with Vitastor disk, use the following commands: |
|||
|
|||
Old syntax (-drive): |
|||
|
|||
``` |
|||
qemu-system-x86_64 -enable-kvm -m 1024 \ |
|||
-drive 'file=vitastor:etcd_host=192.168.7.2\:2379/v3:image=debian9', |
|||
format=raw,if=none,id=drive-virtio-disk0,cache=none \ |
|||
-device 'virtio-blk-pci,scsi=off,bus=pci.0,addr=0x5,drive=drive-virtio-disk0, |
|||
id=virtio-disk0,bootindex=1,write-cache=off' \ |
|||
-vnc 0.0.0.0:0 |
|||
``` |
|||
|
|||
New syntax (-blockdev): |
|||
|
|||
``` |
|||
qemu-system-x86_64 -enable-kvm -m 1024 \ |
|||
-blockdev '{"node-name":"drive-virtio-disk0","driver":"vitastor","image":"debian9", |
|||
"cache":{"direct":true,"no-flush":false},"auto-read-only":true,"discard":"unmap"}' \ |
|||
-device 'virtio-blk-pci,scsi=off,bus=pci.0,addr=0x5,drive=drive-virtio-disk0, |
|||
id=virtio-disk0,bootindex=1,write-cache=off' \ |
|||
-vnc 0.0.0.0:0 |
|||
``` |
|||
|
|||
For qemu-img, you should use `vitastor:etcd_host=<HOST>:image=<IMAGE>` as filename. For example: |
|||
|
|||
``` |
|||
qemu-img convert -f qcow2 debian10.qcow2 -p -O raw 'vitastor:etcd_host=192.168.7.2\:2379/v3:image=debian10' |
|||
``` |
|||
|
|||
You can also specify `:pool=<POOL>:inode=<INODE>:size=<SIZE>` instead of `:image=<IMAGE>` |
|||
if you don't want to use inode metadata. |
@ -0,0 +1,37 @@ |
|||
--- |
|||
nav_navigation: Навигация |
|||
nav_tags: Теги |
|||
nav_more: Подробнее |
|||
nav_top: К началу |
|||
|
|||
form_placeholder_search: Поиск |
|||
|
|||
error_page_title: Открыта несуществующая страница |
|||
error_message_title: Потерялись? |
|||
error_message_code: Ошибка 404 |
|||
error_message_text: > |
|||
Похоже, страница, которую вы открыли, не существует. Попробуйте найти |
|||
нужную информацию с <a class="gdoc-error__link" href="{{ . }}">главной страницы</a>. |
|||
|
|||
button_toggle_dark: Переключить тёмный/светлый/авто режим |
|||
button_nav_open: Показать навигацию |
|||
button_nav_close: Скрыть навигацию |
|||
button_menu_open: Открыть меню |
|||
button_menu_close: Закрыть меню |
|||
button_homepage: На главную |
|||
|
|||
title_anchor_prefix: "Ссылка на:" |
|||
|
|||
posts_read_more: Читать подробнее |
|||
posts_read_time: |
|||
one: "Одна минута на чтение" |
|||
other: "{{ . }} минут(ы) на чтение" |
|||
posts_update_prefix: Обновлено |
|||
|
|||
footer_build_with: > |
|||
Сделано на <a href="https://gohugo.io/" class="gdoc-footer__link">Hugo</a> с |
|||
<svg class="icon gdoc_heart"><use xlink:href="#gdoc_heart"></use></svg> |
|||
footer_legal_notice: Правовая информация |
|||
footer_privacy_policy: Приватность |
|||
|
|||
language_switch_no_tranlation_prefix: "Страница не переведена:" |
@ -0,0 +1,34 @@ |
|||
<footer class="gdoc-footer"> |
|||
<div class="container flex"> |
|||
<div class="flex flex-wrap" style="flex: 1"> |
|||
<span class="gdoc-footer__item gdoc-footer__item--row"> |
|||
© Vitaliy Filippov, 2021+ |
|||
</span> |
|||
</div> |
|||
<div class="flex flex-wrap"> |
|||
{{ with .Site.Params.GeekdocLegalNotice }} |
|||
<span class="gdoc-footer__item gdoc-footer__item--row"> |
|||
<a href="{{ . | relURL }}" class="gdoc-footer__link">{{ i18n "footer_legal_notice" }}</a> |
|||
</span> |
|||
{{ end }} |
|||
{{ with .Site.Params.GeekdocPrivacyPolicy }} |
|||
<span class="gdoc-footer__item gdoc-footer__item--row"> |
|||
<a href="{{ . | relURL }}" class="gdoc-footer__link">{{ i18n "footer_privacy_policy" }}</a> |
|||
</span> |
|||
{{ end }} |
|||
</div> |
|||
{{ if (default true .Site.Params.GeekdocBackToTop) }} |
|||
<div class="flex flex-25 justify-end"> |
|||
<span class="gdoc-footer__item gdoc-footer__item--row" style="margin-right: 50px"> |
|||
{{ i18n "footer_build_with" | safeHTML }} |
|||
</span> |
|||
<span class="gdoc-footer__item"> |
|||
<a class="gdoc-footer__link fake-link" href="#" aria-label="{{ i18n "nav_top" }}"> |
|||
<svg class="icon gdoc_keyboard_arrow_up"><use xlink:href="#gdoc_keyboard_arrow_up"></use></svg> |
|||
<span class="hidden-mobile">{{ i18n "nav_top" }}</span> |
|||
</a> |
|||
</span> |
|||
</div> |
|||
{{ end }} |
|||
</div> |
|||
</footer> |
@ -0,0 +1,138 @@ |
|||
/* Global customization */ |
|||
|
|||
:root { |
|||
--code-max-height: 60rem; |
|||
} |
|||
|
|||
/* Light mode theming */ |
|||
:root, |
|||
:root[color-mode="light"] { |
|||
--header-background: #404050; |
|||
--header-font-color: #ffffff; |
|||
|
|||
--body-background: #ffffff; |
|||
--body-font-color: #343a40; |
|||
|
|||
--button-background: #62cb97; |
|||
--button-border-color: #4ec58a; |
|||
|
|||
--link-color: #c54e8a; |
|||
--link-color-visited: #c54e8a; |
|||
|
|||
--code-background: #f5f6f8; |
|||
--code-accent-color: #e3e7eb; |
|||
--code-accent-color-lite: #eff1f3; |
|||
|
|||
--accent-color: #e9ecef; |
|||
--accent-color-lite: #f8f9fa; |
|||
|
|||
--control-icons: #b2bac1; |
|||
|
|||
--footer-background: #606070; |
|||
--footer-font-color: #ffffff; |
|||
--footer-link-color: #ffcc5c; |
|||
--footer-link-color-visited: #ffcc5c; |
|||
} |
|||
@media (prefers-color-scheme: light) { |
|||
:root { |
|||
--header-background: #404050; |
|||
--header-font-color: #ffffff; |
|||
|
|||
--body-background: #ffffff; |
|||
--body-font-color: #343a40; |
|||
|
|||
--button-background: #62cb97; |
|||
--button-border-color: #4ec58a; |
|||
|
|||
--link-color: #c54e8a; |
|||
--link-color-visited: #c54e8a; |
|||
|
|||
--code-background: #f5f6f8; |
|||
--code-accent-color: #e3e7eb; |
|||
--code-accent-color-lite: #eff1f3; |
|||
|
|||
--accent-color: #e9ecef; |
|||
--accent-color-lite: #f8f9fa; |
|||
|
|||
--control-icons: #b2bac1; |
|||
|
|||
--footer-background: #606070; |
|||
--footer-font-color: #ffffff; |
|||
--footer-link-color: #ffcc5c; |
|||
--footer-link-color-visited: #ffcc5c; |
|||
} |
|||
} |
|||
|
|||
/* Dark mode theming */ |
|||
:root[color-mode="dark"] { |
|||
--header-background: #202830; |
|||
--header-font-color: #ffffff; |
|||
|
|||
--body-background: #343a44; |
|||
--body-font-color: #ced3d8; |
|||
|
|||
--button-background: #62cb97; |
|||
--button-border-color: #4ec58a; |
|||
|
|||
--link-color: #7ac29e; |
|||
--link-color-visited: #7ac29e; |
|||
|
|||
--code-background: #2f353a; |
|||
--code-accent-color: #262b2f; |
|||
--code-accent-color-lite: #2b3035; |
|||
|
|||
--accent-color: #2b3035; |
|||
--accent-color-lite: #2f353a; |
|||
|
|||
--control-icons: #b2bac1; |
|||
|
|||
--footer-background: #2f333e; |
|||
--footer-font-color: #cccccc; |
|||
--footer-link-color: #7ac29e; |
|||
--footer-link-color-visited: #7ac29e; |
|||
} |
|||
@media (prefers-color-scheme: dark) { |
|||
:root { |
|||
--header-background: #404070; |
|||
--header-font-color: #ffffff; |
|||
|
|||
--body-background: #343a40; |
|||
--body-font-color: #ced3d8; |
|||
|
|||
--button-background: #62cb97; |
|||
--button-border-color: #4ec58a; |
|||
|
|||
--link-color: #7ac29e; |
|||
--link-color-visited: #7ac29e; |
|||
|
|||
--code-background: #2f353a; |
|||
--code-accent-color: #262b2f; |
|||
--code-accent-color-lite: #2b3035; |
|||
|
|||
--accent-color: #2b3035; |
|||
--accent-color-lite: #2f353a; |
|||
|
|||
--control-icons: #b2bac1; |
|||
|
|||
--footer-background: #2f333e; |
|||
--footer-font-color: #cccccc; |
|||
--footer-link-color: #7ac29e; |
|||
--footer-link-color-visited: #7ac29e; |
|||
} |
|||
} |
|||
|
|||
.gdoc-brand__img { |
|||
width: 48px; |
|||
height: auto; |
|||
margin-top: -4px; |
|||
margin-bottom: -4px; |
|||
} |
|||
|
|||
.gdoc-menu-header > span { |
|||
display: flex; |
|||
flex-direction: row-reverse; |
|||
} |
|||
|
|||
span.gdoc-language { |
|||
margin-right: 20px; |
|||
} |