Compare commits

...

4 Commits

Author SHA1 Message Date
Vitaliy Filippov 1f6c4c79d6 vmsplice+splice experiment in stub_osd to test it too 2021-11-22 01:20:12 +03:00
Vitaliy Filippov 4936c42132 Splice via io_uring - bad result too
40% CPU according to perf is lost inside do_splice() -> unix_stream_sendpage()
without io_uring and in various exc_page_fault() with io_uring
2021-11-22 00:13:27 +03:00
Vitaliy Filippov 6c3248a36c Experiment: vmsplice+splice "zero-copy" read in NBD 2021-11-22 00:12:39 +03:00
Vitaliy Filippov a863013cb2 Add a patch for qemu 6.1 and replace _ with - in qemu options 2021-11-21 16:16:46 +03:00
17 changed files with 474 additions and 125 deletions

View File

@ -7,11 +7,11 @@ ARG REL=
WORKDIR /root
RUN if [ "$REL" = "buster" ]; then \
echo 'deb http://deb.debian.org/debian buster-backports main' >> /etc/apt/sources.list; \
RUN if [ "$REL" = "buster" -o "$REL" = "bullseye" ]; then \
echo "deb http://deb.debian.org/debian $REL-backports main" >> /etc/apt/sources.list; \
echo >> /etc/apt/preferences; \
echo 'Package: *' >> /etc/apt/preferences; \
echo 'Pin: release a=buster-backports' >> /etc/apt/preferences; \
echo "Pin: release a=$REL-backports" >> /etc/apt/preferences; \
echo 'Pin-Priority: 500' >> /etc/apt/preferences; \
fi; \
grep '^deb ' /etc/apt/sources.list | perl -pe 's/^deb/deb-src/' >> /etc/apt/sources.list; \
@ -27,15 +27,20 @@ RUN apt-get -y build-dep fio
RUN apt-get --download-only source qemu
RUN apt-get --download-only source fio
ADD patches/qemu-5.0-vitastor.patch patches/qemu-5.1-vitastor.patch /root/vitastor/patches/
ADD patches/qemu-5.0-vitastor.patch patches/qemu-5.1-vitastor.patch patches/qemu-6.1-vitastor.patch /root/vitastor/patches/
RUN set -e; \
mkdir -p /root/packages/qemu-$REL; \
rm -rf /root/packages/qemu-$REL/*; \
cd /root/packages/qemu-$REL; \
dpkg-source -x /root/qemu*.dsc; \
if [ -d /root/packages/qemu-$REL/qemu-5.0 ]; then \
cp /root/vitastor/patches/qemu-5.0-vitastor.patch /root/packages/qemu-$REL/qemu-5.0/debian/patches; \
echo qemu-5.0-vitastor.patch >> /root/packages/qemu-$REL/qemu-5.0/debian/patches/series; \
if ls -d /root/packages/qemu-$REL/qemu-5.0*; then \
D=$(ls -d /root/packages/qemu-$REL/qemu-5.0*); \
cp /root/vitastor/patches/qemu-5.0-vitastor.patch $D/debian/patches; \
echo qemu-5.0-vitastor.patch >> $D/debian/patches/series; \
elif ls /root/packages/qemu-$REL/qemu-6.1*; then \
D=$(ls -d /root/packages/qemu-$REL/qemu-6.1*); \
cp /root/vitastor/patches/qemu-6.1-vitastor.patch $D/debian/patches; \
echo qemu-6.1-vitastor.patch >> $D/debian/patches/series; \
else \
cp /root/vitastor/patches/qemu-5.1-vitastor.patch /root/packages/qemu-$REL/qemu-*/debian/patches; \
P=`ls -d /root/packages/qemu-$REL/qemu-*/debian/patches`; \

View File

@ -489,7 +489,7 @@ class VitastorDriver(driver.CloneableImageVD,
# FIXME use etcd_address in qemu driver
kk = 'etcd_host'
if v:
args += ':'+kk+'='+v.replace(':', '\\:')
args += ':'+kk.replace('_', '-')+'='+v.replace(':', '\\:')
return args
def delete_volume(self, volume):

View File

@ -245,9 +245,9 @@ index cbf0aa4..096700d 100644
+
+ if (virJSONValueObjectCreate(&ret,
+ "s:driver", "vitastor",
+ "S:etcd_host", etcd,
+ "S:etcd_prefix", src->relPath,
+ "S:config_path", src->configFile,
+ "S:etcd-host", etcd,
+ "S:etcd-prefix", src->relPath,
+ "S:config-path", src->configFile,
+ "s:image", src->path,
+ NULL) < 0)
+ goto cleanup;
@ -293,7 +293,7 @@ index 822d5f8..e375cef 100644
+ virBufferStrcat(&buf, "vitastor:image=", src->path, NULL);
+
+ if (src->nhosts > 0) {
+ virBufferAddLit(&buf, ":etcd_host=");
+ virBufferAddLit(&buf, ":etcd-host=");
+ for (i = 0; i < src->nhosts; i++) {
+ if (i)
+ virBufferAddLit(&buf, ",");
@ -311,10 +311,10 @@ index 822d5f8..e375cef 100644
+ }
+
+ if (src->configFile)
+ virBufferEscape(&buf, '\\', ":", ":config_path=%s", src->configFile);
+ virBufferEscape(&buf, '\\', ":", ":config-path=%s", src->configFile);
+
+ if (src->relPath)
+ virBufferEscape(&buf, '\\', ":", ":etcd_prefix=%s", src->relPath);
+ virBufferEscape(&buf, '\\', ":", ":etcd-prefix=%s", src->relPath);
+
+ ret = virBufferContentAndReset(&buf);
+ break;
@ -438,16 +438,16 @@ index bd4b027..b323cd6 100644
+ if (STRPREFIX(p, "image=")) {
+ if (VIR_STRDUP(src->path, p + strlen("image=")) < 0)
+ return -1;
+ } else if (STRPREFIX(p, "etcd_prefix=")) {
+ if (VIR_STRDUP(src->relPath, p + strlen("etcd_prefix=")) < 0)
+ } else if (STRPREFIX(p, "etcd-prefix=")) {
+ if (VIR_STRDUP(src->relPath, p + strlen("etcd-prefix=")) < 0)
+ return -1;
+ } else if (STRPREFIX(p, "config_file=")) {
+ if (VIR_STRDUP(src->configFile, p + strlen("config_file=")) < 0)
+ } else if (STRPREFIX(p, "config-path=")) {
+ if (VIR_STRDUP(src->configFile, p + strlen("config-path=")) < 0)
+ return -1;
+ } else if (STRPREFIX(p, "etcd_host=")) {
+ } else if (STRPREFIX(p, "etcd-host=")) {
+ char *h, *sep;
+
+ h = p + strlen("etcd_host=");
+ h = p + strlen("etcd-host=");
+ while (h < e) {
+ for (sep = h; sep < e; ++sep) {
+ if (*sep == '\\' && (sep[1] == ',' ||
@ -507,8 +507,8 @@ index bd4b027..b323cd6 100644
+{
+ const char *filename;
+ const char *image = virJSONValueObjectGetString(json, "image");
+ const char *conf = virJSONValueObjectGetString(json, "config_path");
+ const char *etcd_prefix = virJSONValueObjectGetString(json, "etcd_prefix");
+ const char *conf = virJSONValueObjectGetString(json, "config-path");
+ const char *etcd_prefix = virJSONValueObjectGetString(json, "etcd-prefix");
+ virJSONValuePtr servers = virJSONValueObjectGetArray(json, "server");
+ size_t nservers;
+ size_t i;

View File

@ -244,9 +244,9 @@ index f9c6da2..922dde5 100644
+ }
+
+ if (virJSONValueObjectCreate(&ret,
+ "S:etcd_host", etcd,
+ "S:etcd_prefix", src->query,
+ "S:config_path", src->configFile,
+ "S:etcd-host", etcd,
+ "S:etcd-prefix", src->query,
+ "S:config-path", src->configFile,
+ "s:image", src->path,
+ NULL) < 0)
+ return NULL;
@ -311,7 +311,7 @@ index 6f970a3..10b39ca 100644
+ virBufferStrcat(&buf, "vitastor:image=", src->path, NULL);
+
+ if (src->nhosts > 0) {
+ virBufferAddLit(&buf, ":etcd_host=");
+ virBufferAddLit(&buf, ":etcd-host=");
+ for (i = 0; i < src->nhosts; i++) {
+ if (i)
+ virBufferAddLit(&buf, ",");
@ -329,10 +329,10 @@ index 6f970a3..10b39ca 100644
+ }
+
+ if (src->configFile)
+ virBufferEscape(&buf, '\\', ":", ":config_path=%s", src->configFile);
+ virBufferEscape(&buf, '\\', ":", ":config-path=%s", src->configFile);
+
+ if (src->query)
+ virBufferEscape(&buf, '\\', ":", ":etcd_prefix=%s", src->query);
+ virBufferEscape(&buf, '\\', ":", ":etcd-prefix=%s", src->query);
+
+ ret = virBufferContentAndReset(&buf);
+ break;
@ -462,14 +462,14 @@ index 0d3c2af..36e3afc 100644
+
+ if (STRPREFIX(p, "image=")) {
+ src->path = g_strdup(p + strlen("image="));
+ } else if (STRPREFIX(p, "etcd_prefix=")) {
+ src->query = g_strdup(p + strlen("etcd_prefix="));
+ } else if (STRPREFIX(p, "config_file=")) {
+ src->configFile = g_strdup(p + strlen("config_file="));
+ } else if (STRPREFIX(p, "etcd_host=")) {
+ } else if (STRPREFIX(p, "etcd-prefix=")) {
+ src->query = g_strdup(p + strlen("etcd-prefix="));
+ } else if (STRPREFIX(p, "config-path=")) {
+ src->configFile = g_strdup(p + strlen("config-path="));
+ } else if (STRPREFIX(p, "etcd-host=")) {
+ char *h, *sep;
+
+ h = p + strlen("etcd_host=");
+ h = p + strlen("etcd-host=");
+ while (h < e) {
+ for (sep = h; sep < e; ++sep) {
+ if (*sep == '\\' && (sep[1] == ',' ||
@ -526,8 +526,8 @@ index 0d3c2af..36e3afc 100644
+{
+ const char *filename;
+ const char *image = virJSONValueObjectGetString(json, "image");
+ const char *conf = virJSONValueObjectGetString(json, "config_path");
+ const char *etcd_prefix = virJSONValueObjectGetString(json, "etcd_prefix");
+ const char *conf = virJSONValueObjectGetString(json, "config-path");
+ const char *etcd_prefix = virJSONValueObjectGetString(json, "etcd-prefix");
+ virJSONValuePtr servers = virJSONValueObjectGetArray(json, "server");
+ size_t nservers;
+ size_t i;

View File

@ -276,9 +276,9 @@ index 6627d04..c33f428 100644
+ }
+
+ if (virJSONValueObjectCreate(&ret,
+ "S:etcd_host", etcd,
+ "S:etcd_prefix", src->query,
+ "S:config_path", src->configFile,
+ "S:etcd-host", etcd,
+ "S:etcd-prefix", src->query,
+ "S:config-path", src->configFile,
+ "s:image", src->path,
+ NULL) < 0)
+ return NULL;
@ -343,7 +343,7 @@ index ea51369..8258632 100644
+ virBufferStrcat(&buf, "vitastor:image=", src->path, NULL);
+
+ if (src->nhosts > 0) {
+ virBufferAddLit(&buf, ":etcd_host=");
+ virBufferAddLit(&buf, ":etcd-host=");
+ for (i = 0; i < src->nhosts; i++) {
+ if (i)
+ virBufferAddLit(&buf, ",");
@ -361,10 +361,10 @@ index ea51369..8258632 100644
+ }
+
+ if (src->configFile)
+ virBufferEscape(&buf, '\\', ":", ":config_path=%s", src->configFile);
+ virBufferEscape(&buf, '\\', ":", ":config-path=%s", src->configFile);
+
+ if (src->query)
+ virBufferEscape(&buf, '\\', ":", ":etcd_prefix=%s", src->query);
+ virBufferEscape(&buf, '\\', ":", ":etcd-prefix=%s", src->query);
+
+ ret = virBufferContentAndReset(&buf);
+ break;
@ -474,14 +474,14 @@ index e48ae72..d7a9b72 100644
+
+ if (STRPREFIX(p, "image=")) {
+ src->path = g_strdup(p + strlen("image="));
+ } else if (STRPREFIX(p, "etcd_prefix=")) {
+ src->query = g_strdup(p + strlen("etcd_prefix="));
+ } else if (STRPREFIX(p, "config_file=")) {
+ src->configFile = g_strdup(p + strlen("config_file="));
+ } else if (STRPREFIX(p, "etcd_host=")) {
+ } else if (STRPREFIX(p, "etcd-prefix=")) {
+ src->query = g_strdup(p + strlen("etcd-prefix="));
+ } else if (STRPREFIX(p, "config-path=")) {
+ src->configFile = g_strdup(p + strlen("config-path="));
+ } else if (STRPREFIX(p, "etcd-host=")) {
+ char *h, *sep;
+
+ h = p + strlen("etcd_host=");
+ h = p + strlen("etcd-host=");
+ while (h < e) {
+ for (sep = h; sep < e; ++sep) {
+ if (*sep == '\\' && (sep[1] == ',' ||
@ -538,8 +538,8 @@ index e48ae72..d7a9b72 100644
+{
+ const char *filename;
+ const char *image = virJSONValueObjectGetString(json, "image");
+ const char *conf = virJSONValueObjectGetString(json, "config_path");
+ const char *etcd_prefix = virJSONValueObjectGetString(json, "etcd_prefix");
+ const char *conf = virJSONValueObjectGetString(json, "config-path");
+ const char *etcd_prefix = virJSONValueObjectGetString(json, "etcd-prefix");
+ virJSONValue *servers = virJSONValueObjectGetArray(json, "server");
+ size_t nservers;
+ size_t i;

View File

@ -124,7 +124,7 @@ index 391231c527..34dc60dcdd 100644
+ if kk == 'etcd_address':
+ # FIXME use etcd_address in qemu driver
+ kk = 'etcd_host'
+ path += ":"+kk+"="+connection_info['data'][k].replace(':', '\\:')
+ path += ":"+kk.replace('_', '-')+"="+connection_info['data'][k].replace(':', '\\:')
else:
path = 'unknown'
raise exception.DiskNotFound(location='unknown')

View File

@ -23,18 +23,18 @@ Index: qemu-3.1+dfsg/qapi/block-core.json
+# @inode: Inode number
+# @pool: Pool ID
+# @size: Desired image size in bytes
+# @config_path: Path to Vitastor configuration
+# @etcd_host: etcd connection address(es)
+# @etcd_prefix: etcd key/value prefix
+# @config-path: Path to Vitastor configuration
+# @etcd-host: etcd connection address(es)
+# @etcd-prefix: etcd key/value prefix
+##
+{ 'struct': 'BlockdevOptionsVitastor',
+ 'data': { '*inode': 'uint64',
+ '*pool': 'uint64',
+ '*size': 'uint64',
+ '*image': 'str',
+ '*config_path': 'str',
+ '*etcd_host': 'str',
+ '*etcd_prefix': 'str' } }
+ '*config-path': 'str',
+ '*etcd-host': 'str',
+ '*etcd-prefix': 'str' } }
+
+##
# @ReplicationMode:

View File

@ -23,18 +23,18 @@ Index: qemu/qapi/block-core.json
+# @inode: Inode number
+# @pool: Pool ID
+# @size: Desired image size in bytes
+# @config_path: Path to Vitastor configuration
+# @etcd_host: etcd connection address(es)
+# @etcd_prefix: etcd key/value prefix
+# @config-path: Path to Vitastor configuration
+# @etcd-host: etcd connection address(es)
+# @etcd-prefix: etcd key/value prefix
+##
+{ 'struct': 'BlockdevOptionsVitastor',
+ 'data': { '*inode': 'uint64',
+ '*pool': 'uint64',
+ '*size': 'uint64',
+ '*image': 'str',
+ '*config_path': 'str',
+ '*etcd_host': 'str',
+ '*etcd_prefix': 'str' } }
+ '*config-path': 'str',
+ '*etcd-host': 'str',
+ '*etcd-prefix': 'str' } }
+
+##
# @ReplicationMode:

View File

@ -23,18 +23,18 @@ Index: qemu/qapi/block-core.json
+# @inode: Inode number
+# @pool: Pool ID
+# @size: Desired image size in bytes
+# @config_path: Path to Vitastor configuration
+# @etcd_host: etcd connection address(es)
+# @etcd_prefix: etcd key/value prefix
+# @config-path: Path to Vitastor configuration
+# @etcd-host: etcd connection address(es)
+# @etcd-prefix: etcd key/value prefix
+##
+{ 'struct': 'BlockdevOptionsVitastor',
+ 'data': { '*inode': 'uint64',
+ '*pool': 'uint64',
+ '*size': 'uint64',
+ '*image': 'str',
+ '*config_path': 'str',
+ '*etcd_host': 'str',
+ '*etcd_prefix': 'str' } }
+ '*config-path': 'str',
+ '*etcd-host': 'str',
+ '*etcd-prefix': 'str' } }
+
+##
# @ReplicationMode:

View File

@ -23,18 +23,18 @@ Index: qemu-5.1+dfsg/qapi/block-core.json
+# @inode: Inode number
+# @pool: Pool ID
+# @size: Desired image size in bytes
+# @config_path: Path to Vitastor configuration
+# @etcd_host: etcd connection address(es)
+# @etcd_prefix: etcd key/value prefix
+# @config-path: Path to Vitastor configuration
+# @etcd-host: etcd connection address(es)
+# @etcd-prefix: etcd key/value prefix
+##
+{ 'struct': 'BlockdevOptionsVitastor',
+ 'data': { '*inode': 'uint64',
+ '*pool': 'uint64',
+ '*size': 'uint64',
+ '*image': 'str',
+ '*config_path': 'str',
+ '*etcd_host': 'str',
+ '*etcd_prefix': 'str' } }
+ '*config-path': 'str',
+ '*etcd-host': 'str',
+ '*etcd-prefix': 'str' } }
+
+##
# @ReplicationMode:

View File

@ -0,0 +1,88 @@
Index: qemu-6.1+dfsg/qapi/block-core.json
===================================================================
--- qemu-6.1+dfsg.orig/qapi/block-core.json
+++ qemu-6.1+dfsg/qapi/block-core.json
@@ -2838,7 +2838,7 @@
'luks', 'nbd', 'nfs', 'null-aio', 'null-co', 'nvme', 'parallels',
'preallocate', 'qcow', 'qcow2', 'qed', 'quorum', 'raw', 'rbd',
{ 'name': 'replication', 'if': 'defined(CONFIG_REPLICATION)' },
- 'ssh', 'throttle', 'vdi', 'vhdx', 'vmdk', 'vpc', 'vvfat' ] }
+ 'ssh', 'throttle', 'vdi', 'vhdx', 'vitastor', 'vmdk', 'vpc', 'vvfat' ] }
##
# @BlockdevOptionsFile:
@@ -3763,6 +3763,28 @@
'*server': ['InetSocketAddressBase'] } }
##
+# @BlockdevOptionsVitastor:
+#
+# Driver specific block device options for vitastor
+#
+# @image: Image name
+# @inode: Inode number
+# @pool: Pool ID
+# @size: Desired image size in bytes
+# @config-path: Path to Vitastor configuration
+# @etcd-host: etcd connection address(es)
+# @etcd-prefix: etcd key/value prefix
+##
+{ 'struct': 'BlockdevOptionsVitastor',
+ 'data': { '*inode': 'uint64',
+ '*pool': 'uint64',
+ '*size': 'uint64',
+ '*image': 'str',
+ '*config-path': 'str',
+ '*etcd-host': 'str',
+ '*etcd-prefix': 'str' } }
+
+##
# @ReplicationMode:
#
# An enumeration of replication modes.
@@ -4134,6 +4156,7 @@
'throttle': 'BlockdevOptionsThrottle',
'vdi': 'BlockdevOptionsGenericFormat',
'vhdx': 'BlockdevOptionsGenericFormat',
+ 'vitastor': 'BlockdevOptionsVitastor',
'vmdk': 'BlockdevOptionsGenericCOWFormat',
'vpc': 'BlockdevOptionsGenericFormat',
'vvfat': 'BlockdevOptionsVVFAT'
@@ -4523,6 +4546,17 @@
'*encrypt' : 'RbdEncryptionCreateOptions' } }
##
+# @BlockdevCreateOptionsVitastor:
+#
+# Driver specific image creation options for Vitastor.
+#
+# @size: Size of the virtual disk in bytes
+##
+{ 'struct': 'BlockdevCreateOptionsVitastor',
+ 'data': { 'location': 'BlockdevOptionsVitastor',
+ 'size': 'size' } }
+
+##
# @BlockdevVmdkSubformat:
#
# Subformat options for VMDK images
@@ -4718,6 +4752,7 @@
'ssh': 'BlockdevCreateOptionsSsh',
'vdi': 'BlockdevCreateOptionsVdi',
'vhdx': 'BlockdevCreateOptionsVhdx',
+ 'vitastor': 'BlockdevCreateOptionsVitastor',
'vmdk': 'BlockdevCreateOptionsVmdk',
'vpc': 'BlockdevCreateOptionsVpc'
} }
Index: qemu-6.1+dfsg/scripts/modules/module_block.py
===================================================================
--- qemu-6.1+dfsg.orig/scripts/modules/module_block.py
+++ qemu-6.1+dfsg/scripts/modules/module_block.py
@@ -86,6 +86,7 @@ if __name__ == '__main__':
output_file = sys.argv[1]
with open(output_file, 'w') as fheader:
print_top(fheader)
+ add_module(fheader, "vitastor", "vitastor", "vitastor")
for filename in sys.argv[2:]:
if os.path.isfile(filename):

View File

@ -145,7 +145,7 @@ endif (${WITH_FIO})
# vitastor-nbd
add_executable(vitastor-nbd
nbd_proxy.cpp
nbd_proxy.cpp mmap_manager.cpp
)
target_link_libraries(vitastor-nbd
vitastor_client
@ -188,7 +188,7 @@ endif (${WITH_QEMU})
### Test stubs
# stub_osd, stub_bench, osd_test
add_executable(stub_osd stub_osd.cpp rw_blocking.cpp)
add_executable(stub_osd stub_osd.cpp rw_blocking.cpp mmap_manager.cpp)
target_link_libraries(stub_osd tcmalloc_minimal)
add_executable(stub_bench stub_bench.cpp rw_blocking.cpp)
target_link_libraries(stub_bench tcmalloc_minimal)

82
src/mmap_manager.cpp Normal file
View File

@ -0,0 +1,82 @@
#include <stdexcept>
#include <cassert>
#include <sys/mman.h>
#include "mmap_manager.h"
mmap_manager_t::mmap_manager_t(uint64_t mmap_size)
{
this->mmap_size = mmap_size;
}
mmap_manager_t::~mmap_manager_t()
{
for (auto & kv: past_buffers)
{
munmap(kv.second.addr, kv.second.size);
}
if (active_buffer.addr != NULL)
{
munmap(active_buffer.addr, active_buffer.size);
}
}
void *mmap_manager_t::alloc(uint64_t size)
{
if (!active_buffer.addr || (active_buffer.pos + size) > active_buffer.size)
{
if (active_buffer.addr)
{
if (active_buffer.freed >= active_buffer.pos)
munmap(active_buffer.addr, active_buffer.size);
else
past_buffers[active_buffer.addr] = active_buffer;
active_buffer = { 0 };
}
uint64_t new_size = size < mmap_size ? mmap_size : size;
void *buf = mmap(NULL, new_size, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
if (!buf)
throw std::runtime_error(std::string("can't mmap "+std::to_string(new_size)+" bytes"));
active_buffer = {
.addr = buf,
.size = new_size,
.freed = 0,
.pos = 0,
};
}
void *res = active_buffer.addr + active_buffer.pos;
active_buffer.pos += size;
return res;
}
void mmap_manager_t::free(void *addr, uint64_t size)
{
auto it = past_buffers.upper_bound(addr);
if (it != past_buffers.begin())
{
if (it == past_buffers.end())
{
it--;
if (addr < it->second.addr || addr >= it->second.addr+it->second.size)
it = past_buffers.end();
}
else
it--;
}
else
it = past_buffers.end();
if (it != past_buffers.end())
{
assert(addr >= it->second.addr && addr+size <= it->second.addr+it->second.size);
it->second.freed += size;
if (it->second.freed >= it->second.pos)
{
munmap(it->second.addr, it->second.size);
past_buffers.erase(it);
}
}
else
{
assert(addr < active_buffer.addr+active_buffer.size);
active_buffer.freed += size;
}
}

26
src/mmap_manager.h Normal file
View File

@ -0,0 +1,26 @@
#pragma once
#include <stdint.h>
#include <map>
struct mmap_buffer_t
{
void *addr = NULL;
uint64_t size = 0;
uint64_t freed = 0;
uint64_t pos = 0;
};
class mmap_manager_t
{
protected:
uint64_t mmap_size = 32*1024*1024;
std::map<void*, mmap_buffer_t> past_buffers;
mmap_buffer_t active_buffer;
public:
mmap_manager_t(uint64_t mmap_size = 32*1024*1024);
~mmap_manager_t();
void *alloc(uint64_t size);
void free(void *addr, uint64_t size);
};

View File

@ -17,6 +17,8 @@
#include "epoll_manager.h"
#include "cluster_client.h"
#include "mmap_manager.h"
#include <stdexcept>
#ifndef MSG_ZEROCOPY
#define MSG_ZEROCOPY 0
@ -24,6 +26,24 @@
const char *exe_name = NULL;
static inline void my_uring_prep_splice(struct io_uring_sqe *sqe,
int fd_in, int64_t off_in,
int fd_out, int64_t off_out,
unsigned int nbytes,
unsigned int splice_flags)
{
my_uring_prep_rw(IORING_OP_SPLICE, sqe, fd_out, NULL, nbytes, (__u64) off_out);
sqe->splice_off_in = (__u64) off_in;
sqe->splice_fd_in = fd_in;
sqe->splice_flags = splice_flags;
}
struct buf_to_free_t
{
void *buf = NULL;
uint64_t unmap = 0;
};
class nbd_proxy
{
protected:
@ -38,7 +58,7 @@ protected:
ring_consumer_t consumer;
std::vector<iovec> send_list, next_send_list;
std::vector<void*> to_free;
std::vector<buf_to_free_t> to_free;
int nbd_fd = -1;
void *recv_buf = NULL;
int receive_buffer_size = 9000;
@ -51,6 +71,10 @@ protected:
msghdr read_msg = { 0 }, send_msg = { 0 };
iovec read_iov = { 0 };
mmap_manager_t mm;
int pipe_fd[2];
int vmspliced = 0;
public:
static json11::Json::object parse_args(int narg, const char *args[])
{
@ -174,6 +198,12 @@ public:
exit(1);
}
}
// Create pipe for splicing
if (pipe(pipe_fd) < 0)
{
fprintf(stderr, "pipe failed: %s\n", strerror(errno));
exit(1);
}
// Create client
ringloop = new ring_loop_t(512);
epmgr = new epoll_manager_t(ringloop);
@ -522,16 +552,76 @@ protected:
{
return;
}
io_uring_sqe* sqe = ringloop->get_sqe();
int i;
//uint64_t len = 0;
for (i = 0; i < send_list.size(); i++)
{
if (to_free[i].unmap)
{
break;
}
//len += send_list[i].iov_len;
}
//if (true)
if (i > 0)
{
/*io_uring_sqe* sqe = ringloop->get_sqe();
if (!sqe)
{
return;
}
ring_data_t* data = ((ring_data_t*)sqe->user_data);
data->callback = [this](ring_data_t *data) { handle_send(data->res); };
data->callback = [this](ring_data_t *data) { handle_send(data->res); };*/
send_msg.msg_iov = send_list.data();
send_msg.msg_iovlen = send_list.size();
my_uring_prep_sendmsg(sqe, nbd_fd, &send_msg, MSG_ZEROCOPY);
//send_msg.msg_iovlen = send_list.size();
send_msg.msg_iovlen = i;
//my_uring_prep_sendmsg(sqe, nbd_fd, &send_msg, MSG_ZEROCOPY);
int res = sendmsg(nbd_fd, &send_msg, MSG_ZEROCOPY);
if (res < 0)
res = -errno;
handle_send(res);
//int r = sendmsg(int sockfd, const struct msghdr *msg, int flags);
}
else
{
io_uring_sqe* sqe = ringloop->get_sqe();
if (!sqe)
{
return;
}
if (vmspliced <= 0)
{
vmspliced = vmsplice(pipe_fd[1], send_list.data(), 1, SPLICE_F_GIFT);
if (vmspliced < 0)
{
throw std::runtime_error(std::string("vmsplice: ")+strerror(errno));
}
}
send_msg.msg_iovlen = 1;
ring_data_t* data = ((ring_data_t*)sqe->user_data);
data->callback = [this](ring_data_t *data)
{
if (data->res > 0)
vmspliced -= data->res;
handle_send(data->res);
};
my_uring_prep_splice(sqe, pipe_fd[0], -1l, nbd_fd, -1l, vmspliced, SPLICE_F_MOVE);
/*int sent = res, spl = res;
while (spl > 0)
{
res = splice(pipe_fd[0], NULL, nbd_fd, NULL, spl, SPLICE_F_MOVE);
if (res < 0)
{
if (errno != EAGAIN)
throw std::runtime_error(std::string("splice: ")+strerror(errno));
}
else
{
spl -= res;
}
}
handle_send(sent);*/
}
}
void handle_send(int result)
@ -547,7 +637,10 @@ protected:
{
if (result >= send_list[to_eat].iov_len)
{
free(to_free[to_eat]);
if (to_free[to_eat].unmap)
mm.free(to_free[to_eat].buf, to_free[to_eat].unmap);
else
free(to_free[to_eat].buf);
result -= send_list[to_eat].iov_len;
to_eat++;
}
@ -659,6 +752,7 @@ protected:
printf("request %lx +%x %lx\n", be64toh(cur_req.from), be32toh(cur_req.len), handle);
#endif
void *buf = NULL;
nbd_reply *reply = NULL;
cluster_op_t *op = new cluster_op_t;
if (req_type == NBD_CMD_READ || req_type == NBD_CMD_WRITE)
{
@ -666,36 +760,51 @@ protected:
op->inode = inode ? inode : watch->cfg.num;
op->offset = be64toh(cur_req.from);
op->len = be32toh(cur_req.len);
if (req_type == NBD_CMD_WRITE)
{
buf = malloc_or_die(sizeof(nbd_reply) + op->len);
reply = (nbd_reply*)buf;
op->iov.push_back(buf + sizeof(nbd_reply), op->len);
}
else
{
buf = mm.alloc(op->len);
reply = (nbd_reply*)malloc_or_die(sizeof(nbd_reply));
op->iov.push_back(buf, op->len);
}
}
else if (req_type == NBD_CMD_FLUSH)
{
op->opcode = OSD_OP_SYNC;
buf = malloc_or_die(sizeof(nbd_reply));
reply = (nbd_reply*)malloc_or_die(sizeof(nbd_reply));
}
op->callback = [this, buf, handle](cluster_op_t *op)
op->callback = [this, buf, reply, handle](cluster_op_t *op)
{
#ifdef DEBUG
printf("reply %lx e=%d\n", handle, op->retval);
#endif
nbd_reply *reply = (nbd_reply*)buf;
reply->magic = htobe32(NBD_REPLY_MAGIC);
memcpy(reply->handle, &handle, 8);
reply->error = htobe32(op->retval < 0 ? -op->retval : 0);
auto & to_list = send_msg.msg_iovlen > 0 ? next_send_list : send_list;
if (op->retval < 0 || op->opcode != OSD_OP_READ)
to_list.push_back({ .iov_base = buf, .iov_len = sizeof(nbd_reply) });
else
to_list.push_back({ .iov_base = buf, .iov_len = sizeof(nbd_reply) + op->len });
to_free.push_back(buf);
to_list.push_back((iovec){ .iov_base = reply, .iov_len = sizeof(nbd_reply) });
to_free.push_back((buf_to_free_t){ .buf = reply, .unmap = 0 });
if (op->retval >= 0 && op->opcode == OSD_OP_READ)
{
to_list.push_back((iovec){ .iov_base = buf, .iov_len = op->len });
to_free.push_back((buf_to_free_t){ .buf = buf, .unmap = op->len });
}
else if (op->opcode == OSD_OP_READ)
{
mm.free(buf, op->len);
}
delete op;
ringloop->wakeup();
};
if (req_type == NBD_CMD_WRITE)
{
cur_op = op;
cur_buf = buf + sizeof(nbd_reply);
cur_buf = buf;
cur_left = op->len;
read_state = CL_READ_DATA;
}

View File

@ -132,16 +132,19 @@ static void vitastor_parse_filename(const char *filename, QDict *options, Error
error_setg(errp, "conf option %s has no value", name);
break;
}
for (int i = 0; i < strlen(name); i++)
if (name[i] == '_')
name[i] = '-';
qemu_vitastor_unescape(name);
value = qemu_vitastor_next_tok(p, ':', &p);
qemu_vitastor_unescape(value);
if (!strcmp(name, "inode") ||
!strcmp(name, "pool") ||
!strcmp(name, "size") ||
!strcmp(name, "use_rdma") ||
!strcmp(name, "rdma_port_num") ||
!strcmp(name, "rdma_gid_index") ||
!strcmp(name, "rdma_mtu"))
!strcmp(name, "use-rdma") ||
!strcmp(name, "rdma-port_num") ||
!strcmp(name, "rdma-gid-index") ||
!strcmp(name, "rdma-mtu"))
{
unsigned long long num_val;
if (parse_uint_full(value, &num_val, 0))
@ -202,15 +205,15 @@ static int vitastor_file_open(BlockDriverState *bs, QDict *options, int flags, E
VitastorClient *client = bs->opaque;
int64_t ret = 0;
qemu_mutex_init(&client->mutex);
client->config_path = g_strdup(qdict_get_try_str(options, "config_path"));
client->config_path = g_strdup(qdict_get_try_str(options, "config-path"));
// FIXME: Rename to etcd_address
client->etcd_host = g_strdup(qdict_get_try_str(options, "etcd_host"));
client->etcd_prefix = g_strdup(qdict_get_try_str(options, "etcd_prefix"));
client->use_rdma = qdict_get_try_int(options, "use_rdma", -1);
client->rdma_device = g_strdup(qdict_get_try_str(options, "rdma_device"));
client->rdma_port_num = qdict_get_try_int(options, "rdma_port_num", 0);
client->rdma_gid_index = qdict_get_try_int(options, "rdma_gid_index", 0);
client->rdma_mtu = qdict_get_try_int(options, "rdma_mtu", 0);
client->etcd_host = g_strdup(qdict_get_try_str(options, "etcd-host"));
client->etcd_prefix = g_strdup(qdict_get_try_str(options, "etcd-prefix"));
client->use_rdma = qdict_get_try_int(options, "use-rdma", -1);
client->rdma_device = g_strdup(qdict_get_try_str(options, "rdma-device"));
client->rdma_port_num = qdict_get_try_int(options, "rdma-port-num", 0);
client->rdma_gid_index = qdict_get_try_int(options, "rdma-gid-index", 0);
client->rdma_mtu = qdict_get_try_int(options, "rdma-mtu", 0);
client->proxy = vitastor_c_create_qemu(
(QEMUSetFDHandler*)aio_set_fd_handler, bdrv_get_aio_context(bs), client->config_path, client->etcd_host, client->etcd_prefix,
client->use_rdma, client->rdma_device, client->rdma_port_num, client->rdma_gid_index, client->rdma_mtu, 0
@ -264,14 +267,14 @@ static int vitastor_file_open(BlockDriverState *bs, QDict *options, int flags, E
}
bs->total_sectors = client->size / BDRV_SECTOR_SIZE;
//client->aio_context = bdrv_get_aio_context(bs);
qdict_del(options, "use_rdma");
qdict_del(options, "rdma_mtu");
qdict_del(options, "rdma_gid_index");
qdict_del(options, "rdma_port_num");
qdict_del(options, "rdma_device");
qdict_del(options, "config_path");
qdict_del(options, "etcd_host");
qdict_del(options, "etcd_prefix");
qdict_del(options, "use-rdma");
qdict_del(options, "rdma-mtu");
qdict_del(options, "rdma-gid-index");
qdict_del(options, "rdma-port-num");
qdict_del(options, "rdma-device");
qdict_del(options, "config-path");
qdict_del(options, "etcd-host");
qdict_del(options, "etcd-prefix");
qdict_del(options, "image");
qdict_del(options, "inode");
qdict_del(options, "pool");
@ -515,9 +518,9 @@ static QEMUOptionParameter vitastor_create_opts[] = {
static const char *vitastor_strong_runtime_opts[] = {
"inode",
"pool",
"config_path",
"etcd_host",
"etcd_prefix",
"config-path",
"etcd-host",
"etcd-prefix",
NULL
};

View File

@ -37,6 +37,7 @@
#include <stdexcept>
#include "mmap_manager.h"
#include "rw_blocking.h"
#include "osd_ops.h"
@ -115,6 +116,12 @@ int bind_stub(const char *bind_address, int bind_port)
void run_stub(int peer_fd)
{
mmap_manager_t mm;
int pipe_fd[2];
if (pipe(pipe_fd) < 0)
{
throw std::runtime_error(std::string("pipe: ") + strerror(errno));
}
osd_any_op_t op;
osd_any_reply_t reply;
void *buf = NULL;
@ -136,11 +143,39 @@ void run_stub(int peer_fd)
if (op.hdr.opcode == OSD_OP_SEC_READ)
{
reply.hdr.retval = op.sec_rw.len;
buf = malloc(op.sec_rw.len);
//buf = malloc(op.sec_rw.len);
buf = mm.alloc(op.sec_rw.len);
r = write_blocking(peer_fd, reply.buf, OSD_PACKET_SIZE);
if (r == OSD_PACKET_SIZE)
r = write_blocking(peer_fd, &buf, op.sec_rw.len);
free(buf);
{
size_t offset = 0;
while (offset < op.sec_rw.len)
{
iovec iov = { .iov_base = buf+offset, .iov_len = op.sec_rw.len-offset };
int vmspliced = vmsplice(pipe_fd[1], &iov, 1, SPLICE_F_GIFT);
if (vmspliced < 0)
{
throw std::runtime_error(std::string("vmsplice: ")+strerror(errno));
}
int spliced = 0;
while (spliced < vmspliced)
{
int r2 = splice(pipe_fd[0], NULL, peer_fd, NULL, vmspliced-spliced, SPLICE_F_MOVE);
if (r2 < 0)
{
if (errno != EAGAIN)
throw std::runtime_error(std::string("splice: ")+strerror(errno));
}
else
spliced += r2;
}
offset += vmspliced;
}
r = offset;
//r = write_blocking(peer_fd, &buf, op.sec_rw.len);
}
mm.free(buf, op.sec_rw.len);
buf = NULL;
if (r < op.sec_rw.len)
break;
}
@ -170,5 +205,6 @@ void run_stub(int peer_fd)
break;
}
}
free(buf);
close(pipe_fd[0]);
close(pipe_fd[1]);
}