vitastor/src/qemu_driver.c

1077 rindas
34 KiB
C

// Copyright (c) Vitaliy Filippov, 2019+
// License: VNPL-1.1 or GNU GPL-2.0+ (see README.md for details)
// QEMU block driver
#ifdef VITASTOR_SOURCE_TREE
#define BUILD_DSO
#define _GNU_SOURCE
#endif
#include "qemu/osdep.h"
#include "qemu/main-loop.h"
#if QEMU_VERSION_MAJOR >= 8
#include "block/block-io.h"
#endif
#include "block/block_int.h"
#include "qapi/error.h"
#include "qapi/qmp/qdict.h"
#include "qapi/qmp/qerror.h"
#include "qemu/uri.h"
#include "qemu/error-report.h"
#include "qemu/module.h"
#include "qemu/option.h"
#if QEMU_VERSION_MAJOR >= 3
#include "qemu/units.h"
#include "block/qdict.h"
#include "qemu/cutils.h"
#elif QEMU_VERSION_MAJOR == 2 && QEMU_VERSION_MINOR >= 10
#include "qemu/cutils.h"
#include "qapi/qmp/qstring.h"
#include "qapi/qmp/qjson.h"
#else
#include "qapi/qmp/qint.h"
#define qdict_put_int(options, name, num_val) qdict_put_obj(options, name, QOBJECT(qint_from_int(num_val)))
#define qdict_put_str(options, name, value) qdict_put_obj(options, name, QOBJECT(qstring_from_str(value)))
#define qobject_unref QDECREF
#endif
#if QEMU_VERSION_MAJOR == 4 && QEMU_VERSION_MINOR >= 2 || QEMU_VERSION_MAJOR > 4
#include "sysemu/replay.h"
#else
#include "sysemu/sysemu.h"
#endif
#include "vitastor_c.h"
#ifdef VITASTOR_SOURCE_TREE
// Empty exported stubs, compiled only when building inside the Vitastor source tree
// (VITASTOR_SOURCE_TREE also defines BUILD_DSO at the top of this file).
// NOTE(review): presumably these satisfy symbols QEMU's module loader expects from a DSO - confirm.
void qemu_module_dummy(void)
{
}
// The function name is produced by the DSO_STAMP_FUN macro, which is defined outside this file.
void DSO_STAMP_FUN(void)
{
}
#endif
typedef struct VitastorFdData VitastorFdData;
// Per-BlockDriverState driver state (used as bs->opaque, see .instance_size in bdrv_vitastor).
typedef struct VitastorClient
{
// Client handle returned by vitastor_c_create_uring_json() or vitastor_c_create_qemu()
void *proxy;
// eventfd returned by vitastor_c_uring_register_eventfd(); set to -1 when the non-io_uring client is used
int uring_eventfd;
// Inode watch handle obtained through vitastor_c_watch_inode(); NULL when opened by inode number
void *watch;
// Copies of the corresponding open options (g_strdup'ed in vitastor_file_open)
char *config_path;
char *etcd_host;
char *etcd_prefix;
char *image;
// Its negation is passed to vitastor_c_read_bitmap() in vitastor_co_block_status()
int skip_parents;
// Inode number (with the pool number merged into the upper bits), pool number and image size in bytes
uint64_t inode;
uint64_t pool;
uint64_t size;
long readonly;
// RDMA settings forwarded to vitastor_c_create_qemu()
int use_rdma;
char *rdma_device;
int rdma_port_num;
int rdma_gid_index;
int rdma_mtu;
// Held around every call into the client library made by this driver
QemuMutex mutex;
// AioContext of the block device, captured at open time
AioContext *ctx;
// Dynamic array of registered fd handler records: fd_count used out of fd_alloc slots
VitastorFdData **fds;
int fd_count, fd_alloc;
// Non-zero while a bottom half running vitastor_uring_handler() is pending
int bh_uring_scheduled;
// Most recently requested allocation bitmap and the inode/offset/length/granularity it describes
uint64_t last_bitmap_inode, last_bitmap_offset, last_bitmap_len;
uint32_t last_bitmap_granularity;
uint8_t *last_bitmap;
} VitastorClient;
// One file descriptor registered by the client library through vitastor_aio_set_fd_handler().
typedef struct VitastorFdData
{
// Owning client; its mutex is taken around the callbacks
VitastorClient *cli;
// The watched file descriptor
int fd;
// Library callbacks invoked by vitastor_aio_fd_read() / vitastor_aio_fd_write()
IOHandler *fd_read, *fd_write;
// Argument passed to fd_read / fd_write
void *opaque;
} VitastorFdData;
// State of one asynchronous request issued from a coroutine and completed by a library callback.
typedef struct VitastorRPC
{
// Block device the request belongs to; its AioContext is used to schedule the completion bottom half
BlockDriverState *bs;
// Coroutine that waits for the request and is woken on completion
Coroutine *co;
// I/O vector of a read/write request (stored by vitastor_co_preadv/pwritev)
QEMUIOVector *iov;
// Value delivered by the completion callback; vitastor_file_open() also reads it as the watch pointer
long ret;
// Set to 1 by vitastor_co_generic_bh_cb(); the issuing coroutine yields until it is non-zero
int complete;
// Parameters and result of a bitmap read (see vitastor_co_block_status)
uint64_t inode, offset, len;
uint32_t bitmap_granularity;
uint8_t *bitmap;
#if QEMU_VERSION_MAJOR == 2 && QEMU_VERSION_MINOR < 8
// Explicit bottom half, needed on QEMU versions without one-shot BH scheduling
QEMUBH *bh;
#endif
} VitastorRPC;
#if QEMU_VERSION_MAJOR == 2 && QEMU_VERSION_MINOR < 8
// Heap-allocated pairing of a client with the bottom half scheduled for it;
// allocated in vitastor_schedule_uring_handler() and released in vitastor_bh_uring_handler().
typedef struct VitastorBH
{
VitastorClient *cli;
QEMUBH *bh;
} VitastorBH;
#endif
// Forward declarations of functions that are referenced before their definitions below
static void vitastor_co_init_task(BlockDriverState *bs, VitastorRPC *task);
static void vitastor_co_generic_cb(void *opaque, long retval);
static void vitastor_co_read_cb(void *opaque, long retval, uint64_t version);
static void vitastor_close(BlockDriverState *bs);
// Cut the first token off <src> in place.
// Scans for the first <delim> that is not preceded by a backslash. When found, the delimiter
// is overwritten with '\0' and *p points just past it; otherwise *p is NULL and <src> is left
// untouched. Escape sequences are kept as is (see qemu_vitastor_unescape). Always returns <src>.
static char *qemu_vitastor_next_tok(char *src, char delim, char **p)
{
    char *cur = src;
    *p = NULL;
    while (*cur != '\0' && *cur != delim)
    {
        // A backslash shields the following character from being taken as the delimiter
        if (cur[0] == '\\' && cur[1] != '\0')
            cur += 2;
        else
            cur += 1;
    }
    if (*cur != delim)
        return src;
    *p = cur + 1;
    *cur = '\0';
    return src;
}
// Remove backslash escapes from <src> in place: every "\c" pair collapses to "c".
// A lone backslash at the very end of the string is kept.
static void qemu_vitastor_unescape(char *src)
{
    const char *in = src;
    char *out = src;
    while (*in != '\0')
    {
        // Skip the backslash itself, then copy whatever follows it verbatim
        if (in[0] == '\\' && in[1] != '\0')
            in++;
        *out++ = *in++;
    }
    *out = '\0';
}
// Parse a "vitastor:" pseudo-filename into block driver options.
//
// Syntax: vitastor[:key=value]*
// Examples:
//   vitastor[:etcd_host=127.0.0.1]:inode=1:pool=1[:rdma_gid_index=3]
//   vitastor:config_path=/etc/vitastor/vitastor.conf:image=testimg
//
// Underscores in keys are replaced with dashes; ':' and '=' may be escaped with a backslash.
// The keys listed below are validated as unsigned integers and stored as numbers,
// everything else is stored as a string. On any failure an error is set in <errp>
// (exactly once) and parsing stops; entries already added to <options> are left in place.
static void vitastor_parse_filename(const char *filename, QDict *options, Error **errp)
{
    const char *start;
    char *p, *buf;
    if (!strstart(filename, "vitastor:", &start))
    {
        error_setg(errp, "File name must start with 'vitastor:'");
        return;
    }
    buf = g_strdup(start);
    p = buf;
    // The following are all key/value pairs
    while (p)
    {
        char *name, *value, *c;
        name = qemu_vitastor_next_tok(p, '=', &p);
        if (!p)
        {
            // Leave immediately: falling through to the checks below could set <errp>
            // a second time, which the QEMU error API forbids
            error_setg(errp, "conf option %s has no value", name);
            goto out;
        }
        for (c = name; *c; c++)
        {
            if (*c == '_')
                *c = '-';
        }
        qemu_vitastor_unescape(name);
        value = qemu_vitastor_next_tok(p, ':', &p);
        qemu_vitastor_unescape(value);
        // NOTE(review): "rdma-port_num" can never match because underscores were just replaced
        // with dashes, so rdma-port-num is always stored as a string. Correcting the literal changes
        // the stored type (string -> number) and must be coordinated with how vitastor_file_open()
        // reads this option.
        if (!strcmp(name, "inode") ||
            !strcmp(name, "pool") ||
            !strcmp(name, "size") ||
            !strcmp(name, "skip-parents") ||
            !strcmp(name, "use-rdma") ||
            !strcmp(name, "rdma-port_num") ||
            !strcmp(name, "rdma-gid-index") ||
            !strcmp(name, "rdma-mtu"))
        {
#if QEMU_VERSION_MAJOR < 8 || QEMU_VERSION_MAJOR == 8 && QEMU_VERSION_MINOR < 1
            unsigned long long num_val;
            if (parse_uint_full(value, &num_val, 0))
#else
            uint64_t num_val;
            if (parse_uint_full(value, 0, &num_val))
#endif
            {
                error_setg(errp, "Illegal %s: %s", name, value);
                goto out;
            }
            qdict_put_int(options, name, num_val);
        }
        else
        {
            qdict_put_str(options, name, value);
        }
    }
    // Without an image name the inode must be fully described: number, pool and size
    if (!qdict_get_try_str(options, "image"))
    {
        if (!qdict_get_try_int(options, "inode", 0))
        {
            error_setg(errp, "one of image (name) and inode (number) must be specified");
            goto out;
        }
        if (!(qdict_get_try_int(options, "inode", 0) >> (64-POOL_ID_BITS)) &&
            !qdict_get_try_int(options, "pool", 0))
        {
            error_setg(errp, "pool number must be specified or included in the inode number");
            goto out;
        }
        if (!qdict_get_try_int(options, "size", 0))
        {
            error_setg(errp, "size must be specified when inode number is used instead of image name");
            goto out;
        }
    }
out:
    g_free(buf);
    return;
}
#if defined VITASTOR_C_API_VERSION && VITASTOR_C_API_VERSION >= 2
// Bottom half / eventfd handler: clears the "scheduled" flag and lets the client library
// process pending io_uring events, all under the client mutex.
static void vitastor_uring_handler(void *opaque)
{
    VitastorClient *cli = opaque;
    qemu_mutex_lock(&cli->mutex);
    // Reset first so that a handler run may be scheduled again afterwards
    cli->bh_uring_scheduled = 0;
    vitastor_c_uring_handle_events(cli->proxy);
    qemu_mutex_unlock(&cli->mutex);
}
#if QEMU_VERSION_MAJOR == 2 && QEMU_VERSION_MINOR < 8
// Bottom half used on QEMU versions without one-shot BH scheduling: runs the io_uring
// event handler for the client, then deletes the bottom half and frees its VitastorBH record.
static void vitastor_bh_uring_handler(void *opaque)
{
    VitastorBH *vbh = opaque;
    // The actual event handler is vitastor_uring_handler(); no vitastor_bh_handler() is defined in this file
    vitastor_uring_handler(vbh->cli);
    qemu_bh_delete(vbh->bh);
    free(vbh);
}
#endif
// Schedule one run of vitastor_uring_handler() in the client's AioContext, unless io_uring
// is not in use (uring_eventfd < 0) or a run is already pending.
// Every caller in this file invokes it with client->mutex held.
static void vitastor_schedule_uring_handler(VitastorClient *client)
{
    void *opaque = client;
    if (client->uring_eventfd >= 0 && !client->bh_uring_scheduled)
    {
        client->bh_uring_scheduled = 1;
#if QEMU_VERSION_MAJOR > 4 || QEMU_VERSION_MAJOR == 4 && QEMU_VERSION_MINOR >= 2
        replay_bh_schedule_oneshot_event(client->ctx, vitastor_uring_handler, opaque);
#elif QEMU_VERSION_MAJOR >= 3 || QEMU_VERSION_MAJOR == 2 && QEMU_VERSION_MINOR >= 8
        aio_bh_schedule_oneshot(client->ctx, vitastor_uring_handler, opaque);
#else
        // No one-shot API: allocate a record that the bottom half frees after it has run
        VitastorBH *vbh = (VitastorBH*)malloc(sizeof(VitastorBH));
        vbh->cli = client;
#if QEMU_VERSION_MAJOR >= 2
        // There is no request task in this function; use the AioContext stored in the client
        vbh->bh = aio_bh_new(client->ctx, vitastor_bh_uring_handler, vbh);
#else
        vbh->bh = qemu_bh_new(vitastor_bh_uring_handler, vbh);
#endif
        qemu_bh_schedule(vbh->bh);
#endif
    }
}
#else
// No-op variant compiled when VITASTOR_C_API_VERSION is undefined or below 2 (no io_uring client API).
static void vitastor_schedule_uring_handler(VitastorClient *client)
{
}
#endif
// Coroutine body: start watching the client's image and wait until the library reports back.
// The watch handle arrives in task->ret through vitastor_co_generic_cb().
// The caller must have set task->bs and cleared task->complete.
static void coroutine_fn vitastor_co_get_metadata(VitastorRPC *task)
{
    VitastorClient *cli = task->bs->opaque;
    task->co = qemu_coroutine_self();
    qemu_mutex_lock(&cli->mutex);
    vitastor_c_watch_inode(cli->proxy, cli->image, vitastor_co_generic_cb, task);
    vitastor_schedule_uring_handler(cli);
    qemu_mutex_unlock(&cli->mutex);
    // Sleep until the completion bottom half marks the task as done
    for (; !task->complete; )
    {
        qemu_coroutine_yield();
    }
}
// AioContext "readable" handler: forwards the event to the library's fd_read callback
// under the client mutex and then schedules io_uring event processing.
static void vitastor_aio_fd_read(void *fddv)
{
    VitastorFdData *entry = fddv;
    qemu_mutex_lock(&entry->cli->mutex);
    (*entry->fd_read)(entry->opaque);
    vitastor_schedule_uring_handler(entry->cli);
    qemu_mutex_unlock(&entry->cli->mutex);
}
// AioContext "writable" handler: forwards the event to the library's fd_write callback
// under the client mutex and then schedules io_uring event processing.
static void vitastor_aio_fd_write(void *fddv)
{
    VitastorFdData *entry = fddv;
    qemu_mutex_lock(&entry->cli->mutex);
    (*entry->fd_write)(entry->opaque);
    vitastor_schedule_uring_handler(entry->cli);
    qemu_mutex_unlock(&entry->cli->mutex);
}
// Version-independent wrapper around aio_set_fd_handler().
// The argument list of aio_set_fd_handler() differs between QEMU releases; the preprocessor
// conditions below insert the extra arguments (is_external, io_flush, io_poll, io_poll_ready)
// for the version ranges they name, always passing 0/NULL for them.
static void universal_aio_set_fd_handler(AioContext *ctx, int fd, IOHandler *fd_read, IOHandler *fd_write, void *opaque)
{
aio_set_fd_handler(ctx, fd,
#if QEMU_VERSION_MAJOR == 2 && QEMU_VERSION_MINOR >= 5 || QEMU_VERSION_MAJOR >= 3 && (QEMU_VERSION_MAJOR < 8 || QEMU_VERSION_MAJOR == 8 && QEMU_VERSION_MINOR < 1)
0 /*is_external*/,
#endif
fd_read,
fd_write,
#if QEMU_VERSION_MAJOR == 1 && QEMU_VERSION_MINOR <= 6 || QEMU_VERSION_MAJOR < 1
NULL /*io_flush*/,
#endif
#if QEMU_VERSION_MAJOR == 2 && QEMU_VERSION_MINOR >= 9 || QEMU_VERSION_MAJOR >= 3
NULL /*io_poll*/,
#endif
#if QEMU_VERSION_MAJOR >= 7
NULL /*io_poll_ready*/,
#endif
opaque);
}
// fd handler registration callback handed to vitastor_c_create_qemu().
// Keeps one VitastorFdData record per fd in client->fds:
//  - fd already known and at least one handler given: the record is updated in place;
//  - fd already known and both handlers NULL: the record is removed from the array;
//  - fd unknown and at least one handler given: a new record is appended.
// Finally the fd is (re)registered in the client's AioContext with the wrappers
// vitastor_aio_fd_read/vitastor_aio_fd_write, which take the client mutex.
// Allocation failures abort, like in strarray_push().
static void vitastor_aio_set_fd_handler(void *vcli, int fd, int unused1, IOHandler *fd_read, IOHandler *fd_write, void *unused2, void *opaque)
{
    VitastorClient *client = (VitastorClient*)vcli;
    VitastorFdData *fdd = NULL;
    int i;
    for (i = 0; i < client->fd_count; i++)
    {
        if (client->fds[i]->fd == fd)
        {
            if (fd_read || fd_write)
            {
                fdd = client->fds[i];
                fdd->opaque = opaque;
                fdd->fd_read = fd_read;
                fdd->fd_write = fd_write;
            }
            else
            {
                // NOTE(review): the removed record is not freed here. This function may be
                // reached from inside vitastor_aio_fd_read/write, which still use the record
                // after the library callback returns, so freeing it at this point looks unsafe.
                for (int j = i+1; j < client->fd_count; j++)
                    client->fds[j-1] = client->fds[j];
                client->fd_count--;
            }
            break;
        }
    }
    if ((fd_read || fd_write) && !fdd)
    {
        fdd = malloc(sizeof(*fdd));
        if (!fdd)
        {
            fprintf(stderr, "bad alloc\n");
            abort();
        }
        fdd->cli = client;
        fdd->fd = fd;
        fdd->fd_read = fd_read;
        fdd->fd_write = fd_write;
        fdd->opaque = opaque;
        if (client->fd_count >= client->fd_alloc)
        {
            // Grow geometrically, starting at 16 slots
            int new_alloc = client->fd_alloc*2;
            VitastorFdData **new_fds;
            if (new_alloc < 16)
                new_alloc = 16;
            new_fds = realloc(client->fds, sizeof(VitastorFdData*) * new_alloc);
            if (!new_fds)
            {
                fprintf(stderr, "bad alloc\n");
                abort();
            }
            client->fds = new_fds;
            client->fd_alloc = new_alloc;
        }
        client->fds[client->fd_count++] = fdd;
    }
    universal_aio_set_fd_handler(
        client->ctx, fd, fd_read ? vitastor_aio_fd_read : NULL, fd_write ? vitastor_aio_fd_write : NULL, fdd
    );
}
#if defined VITASTOR_C_API_VERSION && VITASTOR_C_API_VERSION >= 2
// Minimal growable array of borrowed C strings (the strings themselves are not copied or freed).
typedef struct str_array
{
    const char **items;
    int len, alloc;
} str_array;

// Append <str>, doubling the capacity when full (initial capacity is 4). Aborts on allocation failure.
static void strarray_push(str_array *a, const char *str)
{
    if (a->alloc <= a->len)
    {
        if (a->alloc)
            a->alloc = 2*a->alloc;
        else
            a->alloc = 4;
        a->items = (const char**)realloc(a->items, a->alloc*sizeof(char*));
        if (a->items == NULL)
        {
            fprintf(stderr, "bad alloc\n");
            abort();
        }
    }
    a->items[a->len] = str;
    a->len++;
}

// Append <key> followed by <value>, but only when both are non-NULL.
static void strarray_push_kv(str_array *a, const char *key, const char *value)
{
    if (!key || !value)
        return;
    strarray_push(a, key);
    strarray_push(a, value);
}

// Release the item storage and reset the array to the empty state.
static void strarray_free(str_array *a)
{
    free(a->items);
    a->len = 0;
    a->alloc = 0;
    a->items = NULL;
}
#endif
// Read option <key> as an integer whether it is stored as a number or as a numeric string.
// Returns <def_value> when the key is absent, has another type or is not a valid number.
static int64_t vitastor_opt_int(QDict *options, const char *key, int64_t def_value)
{
    const char *str = qdict_get_try_str(options, key);
    char *end = NULL;
    long long val;
    if (!str)
        return qdict_get_try_int(options, key, def_value);
    errno = 0;
    val = strtoll(str, &end, 0);
    if (errno || end == str || *end != '\0')
        return def_value;
    return val;
}

#if defined VITASTOR_C_API_VERSION && VITASTOR_C_API_VERSION >= 2
// Read option <key> as a string whether it is stored as a string or as an integer.
// Integers are formatted into <buf>. Returns NULL when the key is absent or has another type.
static const char *vitastor_opt_str(QDict *options, const char *key, char *buf, size_t buf_size)
{
    const char *str = qdict_get_try_str(options, key);
    int64_t val;
    if (str || !qdict_haskey(options, key))
        return str;
    val = qdict_get_try_int(options, key, 0);
    // Two different defaults tell a stored 0 apart from "not an integer"
    if (!val && qdict_get_try_int(options, key, 1))
        return NULL;
    snprintf(buf, buf_size, "%" PRId64, val);
    return buf;
}
#endif

// Open a Vitastor image.
// Creates the client (io_uring based when the C API provides it and io_uring is usable,
// otherwise the fd-handler based one), waits until the client is ready, then determines
// the inode and its size either from the image metadata or from the inode/pool/size options.
// Returns 0 on success; on failure sets <errp>, releases the client with vitastor_close()
// and returns -1. Recognized options are removed from <options> on success.
static int vitastor_file_open(BlockDriverState *bs, QDict *options, int flags, Error **errp)
{
    VitastorRPC task;
    VitastorClient *client = bs->opaque;
    void *image = NULL;
    int64_t ret = 0;
    qemu_mutex_init(&client->mutex);
    client->config_path = g_strdup(qdict_get_try_str(options, "config-path"));
    // FIXME: Rename to etcd_address
    client->etcd_host = g_strdup(qdict_get_try_str(options, "etcd-host"));
    client->etcd_prefix = g_strdup(qdict_get_try_str(options, "etcd-prefix"));
    client->skip_parents = qdict_get_try_int(options, "skip-parents", 0);
    // Numeric connection options may arrive either as numbers or as strings
    client->use_rdma = vitastor_opt_int(options, "use-rdma", -1);
    client->rdma_device = g_strdup(qdict_get_try_str(options, "rdma-device"));
    client->rdma_port_num = vitastor_opt_int(options, "rdma-port-num", 0);
    client->rdma_gid_index = vitastor_opt_int(options, "rdma-gid-index", 0);
    client->rdma_mtu = vitastor_opt_int(options, "rdma-mtu", 0);
    client->ctx = bdrv_get_aio_context(bs);
#if defined VITASTOR_C_API_VERSION && VITASTOR_C_API_VERSION >= 2
    // Scratch space for numeric options converted to text; must outlive the create call below
    char use_rdma_buf[24], rdma_port_num_buf[24], rdma_gid_index_buf[24], rdma_mtu_buf[24];
    str_array opt = {};
    strarray_push_kv(&opt, "config_path", qdict_get_try_str(options, "config-path"));
    strarray_push_kv(&opt, "etcd_address", qdict_get_try_str(options, "etcd-host"));
    strarray_push_kv(&opt, "etcd_prefix", qdict_get_try_str(options, "etcd-prefix"));
    strarray_push_kv(&opt, "use_rdma", vitastor_opt_str(options, "use-rdma", use_rdma_buf, sizeof(use_rdma_buf)));
    strarray_push_kv(&opt, "rdma_device", qdict_get_try_str(options, "rdma-device"));
    strarray_push_kv(&opt, "rdma_port_num", vitastor_opt_str(options, "rdma-port-num", rdma_port_num_buf, sizeof(rdma_port_num_buf)));
    strarray_push_kv(&opt, "rdma_gid_index", vitastor_opt_str(options, "rdma-gid-index", rdma_gid_index_buf, sizeof(rdma_gid_index_buf)));
    strarray_push_kv(&opt, "rdma_mtu", vitastor_opt_str(options, "rdma-mtu", rdma_mtu_buf, sizeof(rdma_mtu_buf)));
    strarray_push_kv(&opt, "client_writeback_allowed", (flags & BDRV_O_NOCACHE) ? "0" : "1");
    client->proxy = vitastor_c_create_uring_json(opt.items, opt.len);
    strarray_free(&opt);
    if (client->proxy)
    {
        client->uring_eventfd = vitastor_c_uring_register_eventfd(client->proxy);
        if (client->uring_eventfd < 0)
        {
            fprintf(stderr, "vitastor: failed to create io_uring eventfd: %s\n", strerror(errno));
            error_setg(errp, "failed to create io_uring eventfd");
            vitastor_close(bs);
            return -1;
        }
        universal_aio_set_fd_handler(client->ctx, client->uring_eventfd, vitastor_uring_handler, NULL, client);
    }
    else
    {
        // Writeback cache is unusable without io_uring because the client can't correctly flush on exit
        fprintf(stderr, "vitastor: failed to create io_uring: %s - I/O will be slower%s\n",
            strerror(errno), (flags & BDRV_O_NOCACHE ? "" : " and writeback cache will be disabled"));
#endif
        client->uring_eventfd = -1;
        client->proxy = vitastor_c_create_qemu(
            vitastor_aio_set_fd_handler, client, client->config_path, client->etcd_host, client->etcd_prefix,
            client->use_rdma, client->rdma_device, client->rdma_port_num, client->rdma_gid_index, client->rdma_mtu, 0
        );
#if defined VITASTOR_C_API_VERSION && VITASTOR_C_API_VERSION >= 2
    }
#endif
    image = client->image = g_strdup(qdict_get_try_str(options, "image"));
    // Read-only unless the device is opened for writing
    client->readonly = (flags & BDRV_O_RDWR) ? 0 : 1;
    // Get image metadata (size and readonly flag) or just wait until the client is ready
    if (!image)
        client->image = (char*)"x";
    task.complete = 0;
    task.bs = bs;
    if (qemu_in_coroutine())
    {
        vitastor_co_get_metadata(&task);
    }
    else
    {
#if QEMU_VERSION_MAJOR >= 8
        aio_co_enter(bdrv_get_aio_context(bs), qemu_coroutine_create((void(*)(void*))vitastor_co_get_metadata, &task));
#elif QEMU_VERSION_MAJOR == 2 && QEMU_VERSION_MINOR >= 9 || QEMU_VERSION_MAJOR >= 3
        bdrv_coroutine_enter(bs, qemu_coroutine_create((void(*)(void*))vitastor_co_get_metadata, &task));
#else
        qemu_coroutine_enter(qemu_coroutine_create((void(*)(void*))vitastor_co_get_metadata, &task));
#endif
        BDRV_POLL_WHILE(bs, !task.complete);
    }
    // Restore the real image name (NULL when opened by inode number) after the placeholder above
    client->image = image;
    if (client->image)
    {
        client->watch = (void*)task.ret;
        client->readonly = client->readonly || vitastor_c_inode_get_readonly(client->watch);
        client->size = vitastor_c_inode_get_size(client->watch);
        if (!vitastor_c_inode_get_num(client->watch))
        {
            error_setg(errp, "image does not exist");
            vitastor_close(bs);
            return -1;
        }
        if (!client->size)
        {
            client->size = qdict_get_try_int(options, "size", 0);
        }
    }
    else
    {
        client->watch = NULL;
        client->inode = qdict_get_try_int(options, "inode", 0);
        client->pool = qdict_get_try_int(options, "pool", 0);
        if (client->pool)
        {
            // Replace the upper POOL_ID_BITS of the inode number with the pool number
            client->inode = (client->inode & (((uint64_t)1 << (64-POOL_ID_BITS)) - 1)) | (client->pool << (64-POOL_ID_BITS));
        }
        client->size = qdict_get_try_int(options, "size", 0);
        // The placeholder watch was only needed to wait for client readiness
        vitastor_c_close_watch(client->proxy, (void*)task.ret);
    }
    if (!client->size)
    {
        error_setg(errp, "image size not specified");
        vitastor_close(bs);
        return -1;
    }
    bs->total_sectors = client->size / BDRV_SECTOR_SIZE;
#if QEMU_VERSION_MAJOR > 5 || QEMU_VERSION_MAJOR == 5 && QEMU_VERSION_MINOR >= 1
    /* When extending regular files, we get zeros from the OS */
    bs->supported_truncate_flags = BDRV_REQ_ZERO_WRITE;
#endif
    //client->aio_context = bdrv_get_aio_context(bs);
    qdict_del(options, "use-rdma");
    qdict_del(options, "rdma-mtu");
    qdict_del(options, "rdma-gid-index");
    qdict_del(options, "rdma-port-num");
    qdict_del(options, "rdma-device");
    qdict_del(options, "config-path");
    qdict_del(options, "etcd-host");
    qdict_del(options, "etcd-prefix");
    qdict_del(options, "image");
    qdict_del(options, "inode");
    qdict_del(options, "pool");
    qdict_del(options, "size");
    qdict_del(options, "skip-parents");
    return ret;
}
// Destroy the client and release everything vitastor_file_open() and
// vitastor_aio_set_fd_handler() allocated for this device.
// Also used on the error paths of vitastor_file_open(). Freed pointers are reset to NULL.
static void vitastor_close(BlockDriverState *bs)
{
    VitastorClient *client = bs->opaque;
    int i;
    vitastor_c_destroy(client->proxy);
    client->proxy = NULL;
    // Release the fd handler records that are still registered, then the array itself
    // NOTE(review): assumes no fd callback can fire once vitastor_c_destroy() has returned - confirm.
    for (i = 0; i < client->fd_count; i++)
    {
        free(client->fds[i]);
    }
    free(client->fds);
    client->fds = NULL;
    client->fd_alloc = client->fd_count = 0;
    qemu_mutex_destroy(&client->mutex);
    // g_free() and free() accept NULL, so no guards are needed
    g_free(client->config_path);
    client->config_path = NULL;
    g_free(client->etcd_host);
    client->etcd_host = NULL;
    g_free(client->etcd_prefix);
    client->etcd_prefix = NULL;
    // rdma_device is g_strdup'ed at open time as well and has to be released here
    g_free(client->rdma_device);
    client->rdma_device = NULL;
    g_free(client->image);
    client->image = NULL;
    free(client->last_bitmap);
    client->last_bitmap = NULL;
}
#if QEMU_VERSION_MAJOR >= 3 || QEMU_VERSION_MAJOR == 2 && QEMU_VERSION_MINOR > 2
// Report fixed block sizes: 512-byte logical and 4096-byte physical. Always succeeds.
static int vitastor_probe_blocksizes(BlockDriverState *bs, BlockSizes *bsz)
{
    bsz->log = 512;
    bsz->phys = 4096;
    return 0;
}
#endif
#if QEMU_VERSION_MAJOR >= 3 || QEMU_VERSION_MAJOR == 2 && QEMU_VERSION_MINOR >= 12
// Image creation entry point used by bdrv_create() (qemu-img).
// Only validates the "vitastor:" URL: nothing is created on the Vitastor side.
// Returns 0 when the URL parses, -1 with <errp> set otherwise.
static int coroutine_fn vitastor_co_create_opts(
#if QEMU_VERSION_MAJOR >= 4
    BlockDriver *drv,
#endif
    const char *url, QemuOpts *opts, Error **errp)
{
    // Collect the error locally: <errp> may be NULL and must not be dereferenced
    Error *local_err = NULL;
    QDict *options = qdict_new();
    int ret = 0;
    vitastor_parse_filename(url, options, &local_err);
    if (local_err)
    {
        error_propagate(errp, local_err);
        ret = -1;
    }
    // inodes don't require creation in Vitastor. FIXME: They will when there will be some metadata
    qobject_unref(options);
    return ret;
}
#endif
#if QEMU_VERSION_MAJOR >= 3
// Resize handler. Only the size remembered in the driver state is changed; the inode itself
// is not resized (see the TODO below). The parameter list depends on the QEMU version.
// Returns 0 on success, -ENOTSUP (with <errp> set) for any preallocation mode other than "off".
static int coroutine_fn vitastor_co_truncate(BlockDriverState *bs, int64_t offset,
#if QEMU_VERSION_MAJOR >= 4
bool exact,
#endif
PreallocMode prealloc,
#if QEMU_VERSION_MAJOR >= 5 && QEMU_VERSION_MINOR >= 1 || QEMU_VERSION_MAJOR > 5 || defined RHEL_BDRV_CO_TRUNCATE_FLAGS
BdrvRequestFlags flags,
#endif
Error **errp)
{
VitastorClient *client = bs->opaque;
if (prealloc != PREALLOC_MODE_OFF)
{
error_setg(errp, "Unsupported preallocation mode '%s'", PreallocMode_str(prealloc));
return -ENOTSUP;
}
// TODO: Resize inode to <offset> bytes
#if QEMU_VERSION_MAJOR >= 4
// A non-exact request never shrinks the remembered size
client->size = exact || client->size < offset ? offset : client->size;
#else
client->size = offset;
#endif
return 0;
}
#endif
// Fill driver info: reports a fixed cluster size of 4096 bytes. Always returns 0.
static int vitastor_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
{
bdi->cluster_size = 4096;
return 0;
}
// Return the image size in bytes as remembered in the driver state.
static int64_t vitastor_getlength(BlockDriverState *bs)
{
    const VitastorClient *cli = bs->opaque;
    int64_t length = cli->size;
    return length;
}
// Set I/O limits: request alignment and memory alignments are all fixed at 4096 bytes.
// The signature and return type depend on the QEMU version (void with errp on newer
// versions, int without errp on the oldest ones).
#if QEMU_VERSION_MAJOR >= 3 || QEMU_VERSION_MAJOR == 2 && QEMU_VERSION_MINOR > 0
static void vitastor_refresh_limits(BlockDriverState *bs, Error **errp)
#else
static int vitastor_refresh_limits(BlockDriverState *bs)
#endif
{
bs->bl.request_alignment = 4096;
#if QEMU_VERSION_MAJOR >= 3 || QEMU_VERSION_MAJOR == 2 && QEMU_VERSION_MINOR > 3
bs->bl.min_mem_alignment = 4096;
#endif
bs->bl.opt_mem_alignment = 4096;
#if QEMU_VERSION_MAJOR < 2 || QEMU_VERSION_MAJOR == 2 && QEMU_VERSION_MINOR == 0
return 0;
#endif
}
//static int64_t vitastor_get_allocated_file_size(BlockDriverState *bs)
//{
// return 0;
//}
// Reset <task> to all zeroes and bind it to the calling coroutine and to <bs>.
static void vitastor_co_init_task(BlockDriverState *bs, VitastorRPC *task)
{
    VitastorRPC fresh = { 0 };
    fresh.co = qemu_coroutine_self();
    fresh.bs = bs;
    *task = fresh;
}
// Completion bottom half: marks the request as complete and wakes the waiting coroutine,
// unless this code already runs inside that coroutine.
// The wake-up primitive depends on the QEMU version.
static void vitastor_co_generic_bh_cb(void *opaque)
{
VitastorRPC *task = opaque;
task->complete = 1;
if (qemu_coroutine_self() != task->co)
{
#if QEMU_VERSION_MAJOR >= 3 || QEMU_VERSION_MAJOR == 2 && QEMU_VERSION_MINOR > 8
aio_co_wake(task->co);
#else
#if QEMU_VERSION_MAJOR == 2
// The explicitly created bottom half (see vitastor_co_generic_cb) is deleted here
qemu_bh_delete(task->bh);
#endif
qemu_coroutine_enter(task->co, NULL);
// NOTE(review): the tasks in this file are stack variables of the issuing coroutine -
// confirm that qemu_aio_release() is appropriate for them on these old QEMU versions.
qemu_aio_release(task);
#endif
}
}
// Generic completion callback passed to the client library: stores the result in the task
// and schedules vitastor_co_generic_bh_cb() in the AioContext of the task's block device.
// The scheduling API depends on the QEMU version; the oldest branches create an explicit
// bottom half and keep it in task->bh.
static void vitastor_co_generic_cb(void *opaque, long retval)
{
VitastorRPC *task = opaque;
task->ret = retval;
#if QEMU_VERSION_MAJOR > 4 || QEMU_VERSION_MAJOR == 4 && QEMU_VERSION_MINOR >= 2
replay_bh_schedule_oneshot_event(bdrv_get_aio_context(task->bs), vitastor_co_generic_bh_cb, opaque);
#elif QEMU_VERSION_MAJOR >= 3 || QEMU_VERSION_MAJOR == 2 && QEMU_VERSION_MINOR >= 8
aio_bh_schedule_oneshot(bdrv_get_aio_context(task->bs), vitastor_co_generic_bh_cb, opaque);
#elif QEMU_VERSION_MAJOR >= 2
task->bh = aio_bh_new(bdrv_get_aio_context(task->bs), vitastor_co_generic_bh_cb, opaque);
qemu_bh_schedule(task->bh);
#else
task->bh = qemu_bh_new(vitastor_co_generic_bh_cb, opaque);
qemu_bh_schedule(task->bh);
#endif
}
// Read completion callback: same as vitastor_co_generic_cb(), the reported <version> is ignored.
static void vitastor_co_read_cb(void *opaque, long retval, uint64_t version)
{
vitastor_co_generic_cb(opaque, retval);
}
// Coroutine read: submits a read of <bytes> at <offset> into <iov> and yields until the
// completion callback has fired. Returns the value delivered by the callback (task.ret).
// The inode number comes from the watch when the image was opened by name, otherwise from
// the inode stored at open time. Parameter types depend on the QEMU version; <flags> is unused.
static int coroutine_fn vitastor_co_preadv(BlockDriverState *bs,
#if QEMU_VERSION_MAJOR >= 7 || QEMU_VERSION_MAJOR == 6 && QEMU_VERSION_MINOR >= 2
int64_t offset, int64_t bytes, QEMUIOVector *iov, BdrvRequestFlags flags
#else
uint64_t offset, uint64_t bytes, QEMUIOVector *iov, int flags
#endif
)
{
VitastorClient *client = bs->opaque;
VitastorRPC task;
vitastor_co_init_task(bs, &task);
task.iov = iov;
uint64_t inode = client->watch ? vitastor_c_inode_get_num(client->watch) : client->inode;
// Calls into the client library are made with the client mutex held
qemu_mutex_lock(&client->mutex);
vitastor_c_read(client->proxy, inode, offset, bytes, iov->iov, iov->niov, vitastor_co_read_cb, &task);
vitastor_schedule_uring_handler(client);
qemu_mutex_unlock(&client->mutex);
// Wait for vitastor_co_generic_bh_cb() to mark the task as complete
while (!task.complete)
{
qemu_coroutine_yield();
}
return task.ret;
}
// Coroutine write: submits a write of <bytes> at <offset> from <iov> and yields until the
// completion callback has fired. Returns the value delivered by the callback (task.ret).
// Any cached allocation bitmap is dropped first because the write may change allocation.
// Parameter types depend on the QEMU version; <flags> is unused.
static int coroutine_fn vitastor_co_pwritev(BlockDriverState *bs,
#if QEMU_VERSION_MAJOR >= 7 || QEMU_VERSION_MAJOR == 6 && QEMU_VERSION_MINOR >= 2
int64_t offset, int64_t bytes, QEMUIOVector *iov, BdrvRequestFlags flags
#else
uint64_t offset, uint64_t bytes, QEMUIOVector *iov, int flags
#endif
)
{
VitastorClient *client = bs->opaque;
VitastorRPC task;
vitastor_co_init_task(bs, &task);
task.iov = iov;
if (client->last_bitmap)
{
// Invalidate last bitmap on write
free(client->last_bitmap);
client->last_bitmap = NULL;
}
uint64_t inode = client->watch ? vitastor_c_inode_get_num(client->watch) : client->inode;
// Calls into the client library are made with the client mutex held
qemu_mutex_lock(&client->mutex);
vitastor_c_write(client->proxy, inode, offset, bytes, 0, iov->iov, iov->niov, vitastor_co_generic_cb, &task);
vitastor_schedule_uring_handler(client);
qemu_mutex_unlock(&client->mutex);
// Wait for vitastor_co_generic_bh_cb() to mark the task as complete
while (!task.complete)
{
qemu_coroutine_yield();
}
return task.ret;
}
#if defined VITASTOR_C_API_VERSION && VITASTOR_C_API_VERSION >= 1
#if QEMU_VERSION_MAJOR >= 2 || QEMU_VERSION_MAJOR == 1 && QEMU_VERSION_MINOR >= 7
// Completion callback of vitastor_c_read_bitmap(): stores the result in the task and, when
// the request still matches the range recorded as "last bitmap" in the client, replaces the
// client's cached bitmap with the new one. Then schedules the completion bottom half.
// NOTE(review): when the range no longer matches, <bitmap> is only referenced by the task and
// nothing in this file frees it - confirm ownership.
static void vitastor_co_read_bitmap_cb(void *opaque, long retval, uint8_t *bitmap)
{
VitastorRPC *task = opaque;
VitastorClient *client = task->bs->opaque;
task->ret = retval;
if (retval >= 0)
{
task->bitmap = bitmap;
if (client->last_bitmap_inode == task->inode &&
client->last_bitmap_offset == task->offset &&
client->last_bitmap_len == task->len)
{
free(client->last_bitmap);
client->last_bitmap = bitmap;
}
}
#if QEMU_VERSION_MAJOR > 4 || QEMU_VERSION_MAJOR == 4 && QEMU_VERSION_MINOR >= 2
replay_bh_schedule_oneshot_event(bdrv_get_aio_context(task->bs), vitastor_co_generic_bh_cb, opaque);
#elif QEMU_VERSION_MAJOR >= 3 || QEMU_VERSION_MAJOR == 2 && QEMU_VERSION_MINOR >= 8
aio_bh_schedule_oneshot(bdrv_get_aio_context(task->bs), vitastor_co_generic_bh_cb, opaque);
#elif QEMU_VERSION_MAJOR >= 2
task->bh = aio_bh_new(bdrv_get_aio_context(task->bs), vitastor_co_generic_bh_cb, opaque);
qemu_bh_schedule(task->bh);
#else
task->bh = qemu_bh_new(vitastor_co_generic_bh_cb, opaque);
qemu_bh_schedule(task->bh);
#endif
}
// Report the allocation status of [offset, offset+bytes).
//
// Allocated => returns BDRV_BLOCK_DATA|BDRV_BLOCK_OFFSET_VALID, with `*map` = `offset`, `*file` = `bs`
// Not allocated => returns 0
// Error => returns a negative value (-EAGAIN when the inode block size is unknown,
// otherwise the value delivered by the bitmap read callback)
// `*pnum` is set to the length of the extent the answer applies to.
//
// With want_zero the answer is precise: the extent ends where the allocation bit changes.
// Without want_zero the whole requested range is reported as one extent which is "allocated"
// if any bitmap granule touching it is allocated (so false positives are possible).
// The last bitmap read from the cluster is cached in the client and reused when it covers the request.
static int coroutine_fn vitastor_co_block_status(
    BlockDriverState *bs, bool want_zero, int64_t offset, int64_t bytes,
    int64_t *pnum, int64_t *map, BlockDriverState **file)
{
    VitastorRPC task;
    VitastorClient *client = bs->opaque;
    uint64_t inode = client->watch ? vitastor_c_inode_get_num(client->watch) : client->inode;
    uint8_t bit = 0;
    if (client->last_bitmap && client->last_bitmap_inode == inode &&
        client->last_bitmap_offset <= offset &&
        client->last_bitmap_offset+client->last_bitmap_len >= (want_zero ? offset+1 : offset+bytes))
    {
        // Use the previously read bitmap
        task.bitmap_granularity = client->last_bitmap_granularity;
        task.offset = client->last_bitmap_offset;
        task.len = client->last_bitmap_len;
        task.bitmap = client->last_bitmap;
    }
    else
    {
        // Read bitmap from this position, rounding to full inode PG blocks
        uint32_t block_size = vitastor_c_inode_get_block_size(client->proxy, inode);
        if (!block_size)
            return -EAGAIN;
        // Init coroutine
        vitastor_co_init_task(bs, &task);
        free(client->last_bitmap);
        task.inode = client->last_bitmap_inode = inode;
        task.bitmap_granularity = client->last_bitmap_granularity = vitastor_c_inode_get_bitmap_granularity(client->proxy, inode);
        task.offset = client->last_bitmap_offset = offset / block_size * block_size;
        task.len = client->last_bitmap_len = (offset+bytes+block_size-1) / block_size * block_size - task.offset;
        task.bitmap = client->last_bitmap = NULL;
        qemu_mutex_lock(&client->mutex);
        vitastor_c_read_bitmap(client->proxy, task.inode, task.offset, task.len, !client->skip_parents, vitastor_co_read_bitmap_cb, &task);
        vitastor_schedule_uring_handler(client);
        qemu_mutex_unlock(&client->mutex);
        while (!task.complete)
        {
            qemu_coroutine_yield();
        }
        if (task.ret < 0)
        {
            // Error
            return task.ret;
        }
    }
    if (want_zero)
    {
        // Get precise mapping with all holes
        uint64_t bmp_pos = (offset-task.offset) / task.bitmap_granularity;
        uint64_t bmp_len = task.len / task.bitmap_granularity;
        uint64_t bmp_end = bmp_pos+1;
        bit = (task.bitmap[bmp_pos >> 3] >> (bmp_pos & 0x7)) & 1;
        while (bmp_end < bmp_len && ((task.bitmap[bmp_end >> 3] >> (bmp_end & 0x7)) & 1) == bit)
        {
            bmp_end++;
        }
        *pnum = (bmp_end-bmp_pos) * task.bitmap_granularity;
    }
    else
    {
        // Get larger allocated extents, possibly with false positives
        uint64_t bmp_pos = (offset-task.offset) / task.bitmap_granularity;
        uint64_t bmp_len = task.len / task.bitmap_granularity;
        // Index one past the last granule touched by the request. This is an absolute
        // position inside the bitmap (not a count), clamped to the bitmap length.
        uint64_t bmp_end = (offset+bytes-task.offset+task.bitmap_granularity-1) / task.bitmap_granularity;
        if (bmp_end > bmp_len)
        {
            bmp_end = bmp_len;
        }
        // One allocated granule is enough, so stop at the first hit
        while (bmp_pos < bmp_end && !bit)
        {
            if (!(bmp_pos & 7) && bmp_end >= bmp_pos+8)
            {
                // Byte-aligned and at least 8 granules left: test a whole bitmap byte at once
                bit = bit || task.bitmap[bmp_pos >> 3];
                bmp_pos += 8;
            }
            else
            {
                bit = bit || ((task.bitmap[bmp_pos >> 3] >> (bmp_pos & 0x7)) & 1);
                bmp_pos++;
            }
        }
        *pnum = bytes;
    }
    if (bit)
    {
        *map = offset;
        *file = bs;
    }
    return (bit ? (BDRV_BLOCK_DATA|BDRV_BLOCK_OFFSET_VALID) : 0);
}
#endif
#if QEMU_VERSION_MAJOR == 1 && QEMU_VERSION_MINOR >= 7 || QEMU_VERSION_MAJOR == 2 && QEMU_VERSION_MINOR < 12
// QEMU 1.7-2.11
// Sector-based block status adapter for old QEMU versions: converts sectors to bytes,
// calls vitastor_co_block_status() in precise (want_zero) mode and converts the extent
// length back to sectors. Returns the status value of vitastor_co_block_status() unchanged.
static int64_t coroutine_fn vitastor_co_get_block_status(BlockDriverState *bs,
    int64_t sector_num, int nb_sectors, int *pnum, BlockDriverState **file)
{
    int64_t map = 0;
    int64_t pnumbytes = 0;
    // <file> is already the BlockDriverState** output argument and is passed through as is
    int64_t r = vitastor_co_block_status(bs, 1, sector_num*BDRV_SECTOR_SIZE, nb_sectors*BDRV_SECTOR_SIZE, &pnumbytes, &map, file);
    *pnum = pnumbytes/BDRV_SECTOR_SIZE;
    return r;
}
#endif
#endif
#if !( QEMU_VERSION_MAJOR >= 3 || QEMU_VERSION_MAJOR == 2 && QEMU_VERSION_MINOR >= 7 )
// Sector-based read entry point for old QEMU versions: forwards to vitastor_co_preadv()
// with the sector position and count converted to bytes.
static int coroutine_fn vitastor_co_readv(BlockDriverState *bs, int64_t sector_num, int nb_sectors, QEMUIOVector *iov)
{
    int res;
    res = vitastor_co_preadv(bs, sector_num*BDRV_SECTOR_SIZE, nb_sectors*BDRV_SECTOR_SIZE, iov, 0);
    return res;
}
// Sector-based write entry point for old QEMU versions: forwards to vitastor_co_pwritev()
// with the sector position and count converted to bytes.
static int coroutine_fn vitastor_co_writev(BlockDriverState *bs, int64_t sector_num, int nb_sectors, QEMUIOVector *iov)
{
    int res;
    res = vitastor_co_pwritev(bs, sector_num*BDRV_SECTOR_SIZE, nb_sectors*BDRV_SECTOR_SIZE, iov, 0);
    return res;
}
#endif
// Coroutine flush: asks the client library to sync and yields until the completion
// callback has fired. Returns the value delivered by the callback.
static int coroutine_fn vitastor_co_flush(BlockDriverState *bs)
{
    VitastorRPC rpc;
    VitastorClient *cli = bs->opaque;
    vitastor_co_init_task(bs, &rpc);
    // Calls into the client library are made with the client mutex held
    qemu_mutex_lock(&cli->mutex);
    vitastor_c_sync(cli->proxy, vitastor_co_generic_cb, &rpc);
    vitastor_schedule_uring_handler(cli);
    qemu_mutex_unlock(&cli->mutex);
    for (; !rpc.complete; )
    {
        qemu_coroutine_yield();
    }
    return rpc.ret;
}
// Options accepted by the create operation: only the virtual disk size.
// Newer QEMU versions use a QemuOptsList, the oldest ones a QEMUOptionParameter array.
#if QEMU_VERSION_MAJOR >= 3 || QEMU_VERSION_MAJOR == 2 && QEMU_VERSION_MINOR > 0
static QemuOptsList vitastor_create_opts = {
.name = "vitastor-create-opts",
.head = QTAILQ_HEAD_INITIALIZER(vitastor_create_opts.head),
.desc = {
{
.name = BLOCK_OPT_SIZE,
.type = QEMU_OPT_SIZE,
.help = "Virtual disk size"
},
{ /* end of list */ }
}
};
#else
static QEMUOptionParameter vitastor_create_opts[] = {
{
.name = BLOCK_OPT_SIZE,
.type = OPT_SIZE,
.help = "Virtual disk size"
},
{ NULL }
};
#endif
#if QEMU_VERSION_MAJOR >= 4
// NULL-terminated list of option names registered as .strong_runtime_opts in bdrv_vitastor.
// NOTE(review): "image" also selects which data is accessed but is not listed - confirm whether that is intended.
static const char *vitastor_strong_runtime_opts[] = {
"inode",
"pool",
"config-path",
"etcd-host",
"etcd-prefix",
NULL
};
#endif
// Block driver descriptor registered with QEMU in vitastor_block_init().
// Callback field names differ between QEMU versions, hence the preprocessor conditions.
static BlockDriver bdrv_vitastor = {
.format_name = "vitastor",
.protocol_name = "vitastor",
// QEMU allocates bs->opaque of this size; the driver uses it as VitastorClient
.instance_size = sizeof(VitastorClient),
.bdrv_parse_filename = vitastor_parse_filename,
.bdrv_has_zero_init = bdrv_has_zero_init_1,
#if QEMU_VERSION_MAJOR >= 8
.bdrv_co_get_info = vitastor_get_info,
.bdrv_co_getlength = vitastor_getlength,
#else
.bdrv_get_info = vitastor_get_info,
.bdrv_getlength = vitastor_getlength,
#endif
#if QEMU_VERSION_MAJOR >= 3 || QEMU_VERSION_MAJOR == 2 && QEMU_VERSION_MINOR > 2
.bdrv_probe_blocksizes = vitastor_probe_blocksizes,
#endif
.bdrv_refresh_limits = vitastor_refresh_limits,
// FIXME: Implement it along with per-inode statistics
//.bdrv_get_allocated_file_size = vitastor_get_allocated_file_size,
.bdrv_file_open = vitastor_file_open,
.bdrv_close = vitastor_close,
// Option list for the create operation
#if QEMU_VERSION_MAJOR >= 3 || QEMU_VERSION_MAJOR == 2 && QEMU_VERSION_MINOR > 0
.create_opts = &vitastor_create_opts,
#else
.create_options = vitastor_create_opts,
#endif
// For qmp_blockdev_create(), used by the qemu monitor / QAPI
// Requires patching QAPI IDL, thus unimplemented
//.bdrv_co_create = vitastor_co_create,
#if QEMU_VERSION_MAJOR >= 3 || QEMU_VERSION_MAJOR == 2 && QEMU_VERSION_MINOR >= 12
// For bdrv_create(), used by qemu-img
.bdrv_co_create_opts = vitastor_co_create_opts,
#endif
#if QEMU_VERSION_MAJOR >= 3
.bdrv_co_truncate = vitastor_co_truncate,
#endif
#if defined VITASTOR_C_API_VERSION && VITASTOR_C_API_VERSION >= 1
#if QEMU_VERSION_MAJOR >= 3 || QEMU_VERSION_MAJOR == 2 && QEMU_VERSION_MINOR >= 12
// For snapshot export
.bdrv_co_block_status = vitastor_co_block_status,
#elif QEMU_VERSION_MAJOR == 1 && QEMU_VERSION_MINOR >= 7 || QEMU_VERSION_MAJOR == 2 && QEMU_VERSION_MINOR < 12
.bdrv_co_get_block_status = vitastor_co_get_block_status,
#endif
#endif
// Byte-based I/O where QEMU supports it, sector-based wrappers otherwise
#if QEMU_VERSION_MAJOR >= 3 || QEMU_VERSION_MAJOR == 2 && QEMU_VERSION_MINOR >= 7
.bdrv_co_preadv = vitastor_co_preadv,
.bdrv_co_pwritev = vitastor_co_pwritev,
#else
.bdrv_co_readv = vitastor_co_readv,
.bdrv_co_writev = vitastor_co_writev,
#endif
.bdrv_co_flush_to_disk = vitastor_co_flush,
#if QEMU_VERSION_MAJOR >= 4
.strong_runtime_opts = vitastor_strong_runtime_opts,
#endif
};
// Module initializer: registers the vitastor block driver with QEMU.
static void vitastor_block_init(void)
{
bdrv_register(&bdrv_vitastor);
}
// Hook the initializer into QEMU's block module initialization
block_init(vitastor_block_init);