Simplified distributed block storage with strong consistency, like in Ceph
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 
 
 
 

568 lines
15 KiB

  1. // Copyright (c) Vitaliy Filippov, 2019+
  2. // License: VNPL-1.1 or GNU GPL-2.0+ (see README.md for details)
  3. // QEMU block driver
  4. #define BUILD_DSO
  5. #define _GNU_SOURCE
  6. #include "qemu/osdep.h"
  7. #include "block/block_int.h"
  8. #include "qapi/error.h"
  9. #include "qapi/qmp/qdict.h"
  10. #include "qapi/qmp/qerror.h"
  11. #include "qemu/uri.h"
  12. #include "qemu/error-report.h"
  13. #include "qemu/module.h"
  14. #include "qemu/option.h"
  15. #if QEMU_VERSION_MAJOR >= 3
  16. #include "qemu/units.h"
  17. #include "block/qdict.h"
  18. #include "qemu/cutils.h"
  19. #else
  20. #include "qapi/qmp/qint.h"
  21. #define qdict_put_int(options, name, num_val) qdict_put_obj(options, name, QOBJECT(qint_from_int(num_val)))
  22. #define qdict_put_str(options, name, value) qdict_put_obj(options, name, QOBJECT(qstring_from_str(value)))
  23. #define qobject_unref QDECREF
  24. #endif
  25. #include "qemu_proxy.h"
  // Intentionally empty; nothing in the visible part of this file calls it.
  // NOTE(review): presumably only exists so that the module exports this symbol - confirm.
  void qemu_module_dummy(void) {}
  // Intentionally empty function.
  // NOTE(review): DSO_STAMP_FUN is not defined in this file - presumably a macro from
  // a QEMU header that expands to the module stamp symbol name; confirm.
  void DSO_STAMP_FUN(void) {}
  32. typedef struct VitastorClient
  33. {
  34. void *proxy;
  35. void *watch;
  36. char *config_path;
  37. char *etcd_host;
  38. char *etcd_prefix;
  39. char *image;
  40. uint64_t inode;
  41. uint64_t pool;
  42. uint64_t size;
  43. long readonly;
  44. char *rdma_device;
  45. int rdma_port_num;
  46. int rdma_gid_index;
  47. int rdma_mtu;
  48. QemuMutex mutex;
  49. } VitastorClient;
  50. typedef struct VitastorRPC
  51. {
  52. BlockDriverState *bs;
  53. Coroutine *co;
  54. QEMUIOVector *iov;
  55. long ret;
  56. int complete;
  57. } VitastorRPC;
  58. static void vitastor_co_init_task(BlockDriverState *bs, VitastorRPC *task);
  59. static void vitastor_co_generic_bh_cb(long retval, void *opaque);
  60. static void vitastor_close(BlockDriverState *bs);
  // Cuts the first token off src in place.
  //
  // The token ends at the first unescaped occurrence of delim; a backslash
  // escapes the character that follows it (the backslashes themselves are left
  // in the token - see qemu_rbd_unescape()).
  //
  // On return *p points just past the delimiter, or is NULL when no delimiter
  // was found. The return value is always src, now NUL-terminated at the
  // delimiter position.
  static char *qemu_rbd_next_tok(char *src, char delim, char **p)
  {
      char *cur = src;
      while (*cur != '\0' && *cur != delim)
      {
          // An escape sequence is consumed as a whole so that an escaped delimiter is not seen
          cur += (cur[0] == '\\' && cur[1] != '\0') ? 2 : 1;
      }
      if (*cur == delim)
      {
          *cur = '\0';
          *p = cur + 1;
      }
      else
      {
          *p = NULL;
      }
      return src;
  }
  // Removes backslash escapes from src in place: every "\x" pair becomes "x".
  // A lone backslash at the very end of the string is kept as is.
  static void qemu_rbd_unescape(char *src)
  {
      const char *in = src;
      char *out = src;
      while (*in != '\0')
      {
          if (in[0] == '\\' && in[1] != '\0')
          {
              // Drop the backslash, keep the character it escapes
              in++;
          }
          *out++ = *in++;
      }
      *out = '\0';
  }
  90. // vitastor[:key=value]*
  91. // vitastor[:etcd_host=127.0.0.1]:inode=1:pool=1[:rdma_gid_index=3]
  92. // vitastor:config_path=/etc/vitastor/vitastor.conf:image=testimg
  93. static void vitastor_parse_filename(const char *filename, QDict *options, Error **errp)
  94. {
  95. const char *start;
  96. char *p, *buf;
  97. if (!strstart(filename, "vitastor:", &start))
  98. {
  99. error_setg(errp, "File name must start with 'vitastor:'");
  100. return;
  101. }
  102. buf = g_strdup(start);
  103. p = buf;
  104. // The following are all key/value pairs
  105. while (p)
  106. {
  107. char *name, *value;
  108. name = qemu_rbd_next_tok(p, '=', &p);
  109. if (!p)
  110. {
  111. error_setg(errp, "conf option %s has no value", name);
  112. break;
  113. }
  114. qemu_rbd_unescape(name);
  115. value = qemu_rbd_next_tok(p, ':', &p);
  116. qemu_rbd_unescape(value);
  117. if (!strcmp(name, "inode") ||
  118. !strcmp(name, "pool") ||
  119. !strcmp(name, "size") ||
  120. !strcmp(name, "rdma_port_num") ||
  121. !strcmp(name, "rdma_gid_index") ||
  122. !strcmp(name, "rdma_mtu"))
  123. {
  124. unsigned long long num_val;
  125. if (parse_uint_full(value, &num_val, 0))
  126. {
  127. error_setg(errp, "Illegal %s: %s", name, value);
  128. goto out;
  129. }
  130. qdict_put_int(options, name, num_val);
  131. }
  132. else
  133. {
  134. qdict_put_str(options, name, value);
  135. }
  136. }
  137. if (!qdict_get_try_str(options, "image"))
  138. {
  139. if (!qdict_get_try_int(options, "inode", 0))
  140. {
  141. error_setg(errp, "one of image (name) and inode (number) must be specified");
  142. goto out;
  143. }
  144. if (!(qdict_get_try_int(options, "inode", 0) >> (64-POOL_ID_BITS)) &&
  145. !qdict_get_try_int(options, "pool", 0))
  146. {
  147. error_setg(errp, "pool number must be specified or included in the inode number");
  148. goto out;
  149. }
  150. if (!qdict_get_try_int(options, "size", 0))
  151. {
  152. error_setg(errp, "size must be specified when inode number is used instead of image name");
  153. goto out;
  154. }
  155. }
  156. out:
  157. g_free(buf);
  158. return;
  159. }
  160. static void coroutine_fn vitastor_co_get_metadata(VitastorRPC *task)
  161. {
  162. BlockDriverState *bs = task->bs;
  163. VitastorClient *client = bs->opaque;
  164. task->co = qemu_coroutine_self();
  165. qemu_mutex_lock(&client->mutex);
  166. vitastor_proxy_watch_metadata(client->proxy, client->image, vitastor_co_generic_bh_cb, task);
  167. qemu_mutex_unlock(&client->mutex);
  168. while (!task->complete)
  169. {
  170. qemu_coroutine_yield();
  171. }
  172. }
  173. static int vitastor_file_open(BlockDriverState *bs, QDict *options, int flags, Error **errp)
  174. {
  175. VitastorClient *client = bs->opaque;
  176. int64_t ret = 0;
  177. qemu_mutex_init(&client->mutex);
  178. client->config_path = g_strdup(qdict_get_try_str(options, "config_path"));
  179. client->etcd_host = g_strdup(qdict_get_try_str(options, "etcd_host"));
  180. client->etcd_prefix = g_strdup(qdict_get_try_str(options, "etcd_prefix"));
  181. client->rdma_device = g_strdup(qdict_get_try_str(options, "rdma_device"));
  182. client->rdma_port_num = qdict_get_try_int(options, "rdma_port_num", 0);
  183. client->rdma_gid_index = qdict_get_try_int(options, "rdma_gid_index", 0);
  184. client->rdma_mtu = qdict_get_try_int(options, "rdma_mtu", 0);
  185. client->proxy = vitastor_proxy_create(
  186. bdrv_get_aio_context(bs), client->config_path, client->etcd_host, client->etcd_prefix,
  187. client->rdma_device, client->rdma_port_num, client->rdma_gid_index, client->rdma_mtu
  188. );
  189. client->image = g_strdup(qdict_get_try_str(options, "image"));
  190. client->readonly = (flags & BDRV_O_RDWR) ? 1 : 0;
  191. if (client->image)
  192. {
  193. // Get image metadata (size and readonly flag)
  194. VitastorRPC task;
  195. task.complete = 0;
  196. task.bs = bs;
  197. if (qemu_in_coroutine())
  198. {
  199. vitastor_co_get_metadata(&task);
  200. }
  201. else
  202. {
  203. qemu_coroutine_enter(qemu_coroutine_create((void(*)(void*))vitastor_co_get_metadata, &task));
  204. }
  205. BDRV_POLL_WHILE(bs, !task.complete);
  206. client->watch = (void*)task.ret;
  207. client->readonly = client->readonly || vitastor_proxy_get_readonly(client->watch);
  208. client->size = vitastor_proxy_get_size(client->watch);
  209. if (!vitastor_proxy_get_inode_num(client->watch))
  210. {
  211. error_setg(errp, "image does not exist");
  212. vitastor_close(bs);
  213. }
  214. if (!client->size)
  215. {
  216. client->size = qdict_get_int(options, "size");
  217. }
  218. }
  219. else
  220. {
  221. client->watch = NULL;
  222. client->inode = qdict_get_int(options, "inode");
  223. client->pool = qdict_get_int(options, "pool");
  224. if (client->pool)
  225. {
  226. client->inode = (client->inode & ((1l << (64-POOL_ID_BITS)) - 1)) | (client->pool << (64-POOL_ID_BITS));
  227. }
  228. client->size = qdict_get_int(options, "size");
  229. }
  230. if (!client->size)
  231. {
  232. error_setg(errp, "image size not specified");
  233. vitastor_close(bs);
  234. return -1;
  235. }
  236. bs->total_sectors = client->size / BDRV_SECTOR_SIZE;
  237. //client->aio_context = bdrv_get_aio_context(bs);
  238. qdict_del(options, "rdma_mtu");
  239. qdict_del(options, "rdma_gid_index");
  240. qdict_del(options, "rdma_port_num");
  241. qdict_del(options, "rdma_device");
  242. qdict_del(options, "config_path");
  243. qdict_del(options, "etcd_host");
  244. qdict_del(options, "etcd_prefix");
  245. qdict_del(options, "image");
  246. qdict_del(options, "inode");
  247. qdict_del(options, "pool");
  248. qdict_del(options, "size");
  249. return ret;
  250. }
  251. static void vitastor_close(BlockDriverState *bs)
  252. {
  253. VitastorClient *client = bs->opaque;
  254. vitastor_proxy_destroy(client->proxy);
  255. qemu_mutex_destroy(&client->mutex);
  256. if (client->config_path)
  257. g_free(client->config_path);
  258. if (client->etcd_host)
  259. g_free(client->etcd_host);
  260. if (client->etcd_prefix)
  261. g_free(client->etcd_prefix);
  262. if (client->image)
  263. g_free(client->image);
  264. }
  #if QEMU_VERSION_MAJOR >= 3
  // Reports fixed 4096-byte physical and logical block sizes. Always returns 0.
  static int vitastor_probe_blocksizes(BlockDriverState *bs, BlockSizes *bsz)
  {
      enum { block_size = 4096 };
      bsz->log = block_size;
      bsz->phys = block_size;
      return 0;
  }
  #endif
  273. static int coroutine_fn vitastor_co_create_opts(
  274. #if QEMU_VERSION_MAJOR >= 4
  275. BlockDriver *drv,
  276. #endif
  277. const char *url, QemuOpts *opts, Error **errp)
  278. {
  279. QDict *options;
  280. int ret;
  281. options = qdict_new();
  282. vitastor_parse_filename(url, options, errp);
  283. if (*errp)
  284. {
  285. ret = -1;
  286. goto out;
  287. }
  288. // inodes don't require creation in Vitastor. FIXME: They will when there will be some metadata
  289. ret = 0;
  290. out:
  291. qobject_unref(options);
  292. return ret;
  293. }
  #if QEMU_VERSION_MAJOR >= 3
  // Changes the size the driver reports for the device to `offset` bytes.
  //
  // Only PREALLOC_MODE_OFF is supported; any other mode fails with -ENOTSUP
  // and sets errp. The inode itself is not resized (see TODO). Returns 0 on
  // success.
  static int coroutine_fn vitastor_co_truncate(BlockDriverState *bs, int64_t offset,
  #if QEMU_VERSION_MAJOR >= 4
      bool exact,
  #endif
      PreallocMode prealloc,
  #if QEMU_VERSION_MAJOR >= 5 && QEMU_VERSION_MINOR >= 1 || QEMU_VERSION_MAJOR > 5
      BdrvRequestFlags flags,
  #endif
      Error **errp)
  {
      VitastorClient *client = bs->opaque;
      if (prealloc != PREALLOC_MODE_OFF)
      {
          error_setg(errp, "Unsupported preallocation mode '%s'", PreallocMode_str(prealloc));
          return -ENOTSUP;
      }
      // TODO: Resize inode to <offset> bytes
      // client->size is kept in bytes (vitastor_getlength() returns it as is and
      // vitastor_file_open() divides it by BDRV_SECTOR_SIZE), so store offset unscaled
      client->size = offset;
      return 0;
  }
  #endif
  316. static int vitastor_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
  317. {
  318. bdi->cluster_size = 4096;
  319. return 0;
  320. }
  321. static int64_t vitastor_getlength(BlockDriverState *bs)
  322. {
  323. VitastorClient *client = bs->opaque;
  324. return client->size;
  325. }
  326. #if QEMU_VERSION_MAJOR >= 3
  327. static void vitastor_refresh_limits(BlockDriverState *bs, Error **errp)
  328. #else
  329. static int vitastor_refresh_limits(BlockDriverState *bs)
  330. #endif
  331. {
  332. #if QEMU_VERSION_MAJOR >= 4
  333. bs->bl.request_alignment = 4096;
  334. bs->bl.min_mem_alignment = 4096;
  335. #else
  336. bs->request_alignment = 4096;
  337. #endif
  338. bs->bl.opt_mem_alignment = 4096;
  339. #if QEMU_VERSION_MAJOR < 3
  340. return 0;
  341. #endif
  342. }
  343. static int64_t vitastor_get_allocated_file_size(BlockDriverState *bs)
  344. {
  345. return 0;
  346. }
  347. static void vitastor_co_init_task(BlockDriverState *bs, VitastorRPC *task)
  348. {
  349. *task = (VitastorRPC) {
  350. .co = qemu_coroutine_self(),
  351. .bs = bs,
  352. };
  353. }
  354. static void vitastor_co_generic_bh_cb(long retval, void *opaque)
  355. {
  356. VitastorRPC *task = opaque;
  357. task->ret = retval;
  358. task->complete = 1;
  359. if (qemu_coroutine_self() != task->co)
  360. {
  361. #if QEMU_VERSION_MAJOR >= 3
  362. aio_co_wake(task->co);
  363. #else
  364. qemu_coroutine_enter(task->co, NULL);
  365. qemu_aio_release(task);
  366. #endif
  367. }
  368. }
  369. static int coroutine_fn vitastor_co_preadv(BlockDriverState *bs, uint64_t offset, uint64_t bytes, QEMUIOVector *iov, int flags)
  370. {
  371. VitastorClient *client = bs->opaque;
  372. VitastorRPC task;
  373. vitastor_co_init_task(bs, &task);
  374. task.iov = iov;
  375. uint64_t inode = client->watch ? vitastor_proxy_get_inode_num(client->watch) : client->inode;
  376. qemu_mutex_lock(&client->mutex);
  377. vitastor_proxy_rw(0, client->proxy, inode, offset, bytes, iov->iov, iov->niov, vitastor_co_generic_bh_cb, &task);
  378. qemu_mutex_unlock(&client->mutex);
  379. while (!task.complete)
  380. {
  381. qemu_coroutine_yield();
  382. }
  383. return task.ret;
  384. }
  385. static int coroutine_fn vitastor_co_pwritev(BlockDriverState *bs, uint64_t offset, uint64_t bytes, QEMUIOVector *iov, int flags)
  386. {
  387. VitastorClient *client = bs->opaque;
  388. VitastorRPC task;
  389. vitastor_co_init_task(bs, &task);
  390. task.iov = iov;
  391. uint64_t inode = client->watch ? vitastor_proxy_get_inode_num(client->watch) : client->inode;
  392. qemu_mutex_lock(&client->mutex);
  393. vitastor_proxy_rw(1, client->proxy, inode, offset, bytes, iov->iov, iov->niov, vitastor_co_generic_bh_cb, &task);
  394. qemu_mutex_unlock(&client->mutex);
  395. while (!task.complete)
  396. {
  397. qemu_coroutine_yield();
  398. }
  399. return task.ret;
  400. }
  401. #if QEMU_VERSION_MAJOR < 3
  402. static int coroutine_fn vitastor_co_readv(BlockDriverState *bs, int64_t sector_num, int nb_sectors, QEMUIOVector *iov)
  403. {
  404. return vitastor_co_preadv(bs, sector_num*BDRV_SECTOR_SIZE, nb_sectors*BDRV_SECTOR_SIZE, iov, 0);
  405. }
  406. static int coroutine_fn vitastor_co_writev(BlockDriverState *bs, int64_t sector_num, int nb_sectors, QEMUIOVector *iov)
  407. {
  408. return vitastor_co_pwritev(bs, sector_num*BDRV_SECTOR_SIZE, nb_sectors*BDRV_SECTOR_SIZE, iov, 0);
  409. }
  410. #endif
  411. static int coroutine_fn vitastor_co_flush(BlockDriverState *bs)
  412. {
  413. VitastorClient *client = bs->opaque;
  414. VitastorRPC task;
  415. vitastor_co_init_task(bs, &task);
  416. qemu_mutex_lock(&client->mutex);
  417. vitastor_proxy_sync(client->proxy, vitastor_co_generic_bh_cb, &task);
  418. qemu_mutex_unlock(&client->mutex);
  419. while (!task.complete)
  420. {
  421. qemu_coroutine_yield();
  422. }
  423. return task.ret;
  424. }
  425. #if QEMU_VERSION_MAJOR >= 3
  426. static QemuOptsList vitastor_create_opts = {
  427. .name = "vitastor-create-opts",
  428. .head = QTAILQ_HEAD_INITIALIZER(vitastor_create_opts.head),
  429. .desc = {
  430. {
  431. .name = BLOCK_OPT_SIZE,
  432. .type = QEMU_OPT_SIZE,
  433. .help = "Virtual disk size"
  434. },
  435. { /* end of list */ }
  436. }
  437. };
  438. #else
  439. static QEMUOptionParameter vitastor_create_opts[] = {
  440. {
  441. .name = BLOCK_OPT_SIZE,
  442. .type = OPT_SIZE,
  443. .help = "Virtual disk size"
  444. },
  445. { NULL }
  446. };
  447. #endif
  // NULL-terminated list of option names registered as .strong_runtime_opts
  // of the driver (QEMU 4 and newer only).
  static const char *vitastor_strong_runtime_opts[] = {
      // Which device
      "inode", "pool",
      // Which cluster
      "config_path", "etcd_host", "etcd_prefix",
      // Terminator
      NULL
  };
  456. static BlockDriver bdrv_vitastor = {
  457. .format_name = "vitastor",
  458. .protocol_name = "vitastor",
  459. .instance_size = sizeof(VitastorClient),
  460. .bdrv_parse_filename = vitastor_parse_filename,
  461. .bdrv_has_zero_init = bdrv_has_zero_init_1,
  462. .bdrv_get_info = vitastor_get_info,
  463. .bdrv_getlength = vitastor_getlength,
  464. #if QEMU_VERSION_MAJOR >= 3
  465. .bdrv_probe_blocksizes = vitastor_probe_blocksizes,
  466. #endif
  467. .bdrv_refresh_limits = vitastor_refresh_limits,
  468. // FIXME: Implement it along with per-inode statistics
  469. //.bdrv_get_allocated_file_size = vitastor_get_allocated_file_size,
  470. .bdrv_file_open = vitastor_file_open,
  471. .bdrv_close = vitastor_close,
  472. // Option list for the create operation
  473. #if QEMU_VERSION_MAJOR >= 3
  474. .create_opts = &vitastor_create_opts,
  475. #else
  476. .create_options = vitastor_create_opts,
  477. #endif
  478. // For qmp_blockdev_create(), used by the qemu monitor / QAPI
  479. // Requires patching QAPI IDL, thus unimplemented
  480. //.bdrv_co_create = vitastor_co_create,
  481. #if QEMU_VERSION_MAJOR >= 3
  482. // For bdrv_create(), used by qemu-img
  483. .bdrv_co_create_opts = vitastor_co_create_opts,
  484. .bdrv_co_truncate = vitastor_co_truncate,
  485. .bdrv_co_preadv = vitastor_co_preadv,
  486. .bdrv_co_pwritev = vitastor_co_pwritev,
  487. #else
  488. .bdrv_co_readv = vitastor_co_readv,
  489. .bdrv_co_writev = vitastor_co_writev,
  490. #endif
  491. .bdrv_co_flush_to_disk = vitastor_co_flush,
  492. #if QEMU_VERSION_MAJOR >= 4
  493. .strong_runtime_opts = vitastor_strong_runtime_opts,
  494. #endif
  495. };
  496. static void vitastor_block_init(void)
  497. {
  498. bdrv_register(&bdrv_vitastor);
  499. }
  500. block_init(vitastor_block_init);