Simplified distributed block storage with strong consistency, like in Ceph
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 
 
 

474 lines
12 KiB

  1. // Copyright (c) Vitaliy Filippov, 2019+
  2. // License: VNPL-1.0 or GNU GPL-2.0+ (see README.md for details)
  3. // QEMU block driver
  4. #define BUILD_DSO
  5. #define _GNU_SOURCE
  6. #include "qemu/osdep.h"
  7. #include "block/block_int.h"
  8. #include "qapi/error.h"
  9. #include "qapi/qmp/qdict.h"
  10. #include "qapi/qmp/qerror.h"
  11. #include "qemu/uri.h"
  12. #include "qemu/error-report.h"
  13. #include "qemu/module.h"
  14. #include "qemu/option.h"
  15. #if QEMU_VERSION_MAJOR >= 3
  16. #include "qemu/units.h"
  17. #include "block/qdict.h"
  18. #include "qemu/cutils.h"
  19. #else
  20. #include "qapi/qmp/qint.h"
  21. #define qdict_put_int(options, name, num_val) qdict_put_obj(options, name, QOBJECT(qint_from_int(num_val)))
  22. #define qdict_put_str(options, name, value) qdict_put_obj(options, name, QOBJECT(qstring_from_str(value)))
  23. #define qobject_unref QDECREF
  24. #endif
  25. #include "qemu_proxy.h"
  26. void qemu_module_dummy(void)
  27. {
  28. }
  29. void DSO_STAMP_FUN(void)
  30. {
  31. }
// Per-BlockDriverState driver state (stored in bs->opaque)
typedef struct VitastorClient
{
    // Handle returned by vitastor_proxy_create(), released by vitastor_proxy_destroy()
    void *proxy;
    // Heap copies of the "etcd_host" / "etcd_prefix" options (etcd_prefix may be NULL)
    char *etcd_host;
    char *etcd_prefix;
    // Inode number; when pool is non-zero the pool number is stored in the upper POOL_ID_BITS bits
    uint64_t inode;
    // Pool number from the "pool" option
    uint64_t pool;
    // Image size in bytes (divided by BDRV_SECTOR_SIZE to obtain total_sectors)
    uint64_t size;
    // Set in vitastor_file_open() from the BDRV_O_RDWR open flag
    int readonly;
    // Held around every call into the proxy (vitastor_proxy_rw / vitastor_proxy_sync)
    QemuMutex mutex;
} VitastorClient;
// State of one in-flight request; lives on the stack of the issuing coroutine
typedef struct VitastorRPC
{
    // Block device the request belongs to
    BlockDriverState *bs;
    // Coroutine that issued the request and is resumed by the completion callback
    Coroutine *co;
    // I/O vector of a read/write request (left NULL for flush)
    QEMUIOVector *iov;
    // Result code passed to the completion callback
    int ret;
    // Set to 1 by the completion callback; the issuing coroutine yields until it is set
    int complete;
} VitastorRPC;
  51. static char *qemu_rbd_next_tok(char *src, char delim, char **p)
  52. {
  53. char *end;
  54. *p = NULL;
  55. for (end = src; *end; ++end)
  56. {
  57. if (*end == delim)
  58. break;
  59. if (*end == '\\' && end[1] != '\0')
  60. end++;
  61. }
  62. if (*end == delim)
  63. {
  64. *p = end + 1;
  65. *end = '\0';
  66. }
  67. return src;
  68. }
  69. static void qemu_rbd_unescape(char *src)
  70. {
  71. char *p;
  72. for (p = src; *src; ++src, ++p)
  73. {
  74. if (*src == '\\' && src[1] != '\0')
  75. src++;
  76. *p = *src;
  77. }
  78. *p = '\0';
  79. }
  80. // vitastor[:key=value]*
  81. // vitastor:etcd_host=127.0.0.1:inode=1:pool=1
  82. static void vitastor_parse_filename(const char *filename, QDict *options, Error **errp)
  83. {
  84. const char *start;
  85. char *p, *buf;
  86. if (!strstart(filename, "vitastor:", &start))
  87. {
  88. error_setg(errp, "File name must start with 'vitastor:'");
  89. return;
  90. }
  91. buf = g_strdup(start);
  92. p = buf;
  93. // The following are all key/value pairs
  94. while (p)
  95. {
  96. char *name, *value;
  97. name = qemu_rbd_next_tok(p, '=', &p);
  98. if (!p)
  99. {
  100. error_setg(errp, "conf option %s has no value", name);
  101. break;
  102. }
  103. qemu_rbd_unescape(name);
  104. value = qemu_rbd_next_tok(p, ':', &p);
  105. qemu_rbd_unescape(value);
  106. if (!strcmp(name, "inode") || !strcmp(name, "pool") || !strcmp(name, "size"))
  107. {
  108. unsigned long long num_val;
  109. if (parse_uint_full(value, &num_val, 0))
  110. {
  111. error_setg(errp, "Illegal %s: %s", name, value);
  112. goto out;
  113. }
  114. qdict_put_int(options, name, num_val);
  115. }
  116. else
  117. {
  118. qdict_put_str(options, name, value);
  119. }
  120. }
  121. if (!qdict_get_try_int(options, "inode", 0))
  122. {
  123. error_setg(errp, "inode is missing");
  124. goto out;
  125. }
  126. if (!(qdict_get_try_int(options, "inode", 0) >> (64-POOL_ID_BITS)) &&
  127. !qdict_get_try_int(options, "pool", 0))
  128. {
  129. error_setg(errp, "pool number is missing");
  130. goto out;
  131. }
  132. if (!qdict_get_try_int(options, "size", 0))
  133. {
  134. error_setg(errp, "size is missing");
  135. goto out;
  136. }
  137. if (!qdict_get_str(options, "etcd_host"))
  138. {
  139. error_setg(errp, "etcd_host is missing");
  140. goto out;
  141. }
  142. out:
  143. g_free(buf);
  144. return;
  145. }
  146. static int vitastor_file_open(BlockDriverState *bs, QDict *options, int flags, Error **errp)
  147. {
  148. VitastorClient *client = bs->opaque;
  149. int64_t ret = 0;
  150. client->etcd_host = g_strdup(qdict_get_try_str(options, "etcd_host"));
  151. client->etcd_prefix = g_strdup(qdict_get_try_str(options, "etcd_prefix"));
  152. client->inode = qdict_get_int(options, "inode");
  153. client->pool = qdict_get_int(options, "pool");
  154. if (client->pool)
  155. client->inode = (client->inode & ((1l << (64-POOL_ID_BITS)) - 1)) | (client->pool << (64-POOL_ID_BITS));
  156. client->size = qdict_get_int(options, "size");
  157. client->readonly = (flags & BDRV_O_RDWR) ? 1 : 0;
  158. client->proxy = vitastor_proxy_create(bdrv_get_aio_context(bs), client->etcd_host, client->etcd_prefix);
  159. //client->aio_context = bdrv_get_aio_context(bs);
  160. bs->total_sectors = client->size / BDRV_SECTOR_SIZE;
  161. qdict_del(options, "etcd_host");
  162. qdict_del(options, "etcd_prefix");
  163. qdict_del(options, "inode");
  164. qdict_del(options, "pool");
  165. qdict_del(options, "size");
  166. qemu_mutex_init(&client->mutex);
  167. return ret;
  168. }
  169. static void vitastor_close(BlockDriverState *bs)
  170. {
  171. VitastorClient *client = bs->opaque;
  172. vitastor_proxy_destroy(client->proxy);
  173. qemu_mutex_destroy(&client->mutex);
  174. g_free(client->etcd_host);
  175. if (client->etcd_prefix)
  176. g_free(client->etcd_prefix);
  177. }
  178. #if QEMU_VERSION_MAJOR >= 3
  179. static int vitastor_probe_blocksizes(BlockDriverState *bs, BlockSizes *bsz)
  180. {
  181. bsz->phys = 4096;
  182. bsz->log = 4096;
  183. return 0;
  184. }
  185. #endif
  186. static int coroutine_fn vitastor_co_create_opts(
  187. #if QEMU_VERSION_MAJOR >= 4
  188. BlockDriver *drv,
  189. #endif
  190. const char *url, QemuOpts *opts, Error **errp)
  191. {
  192. QDict *options;
  193. int ret;
  194. options = qdict_new();
  195. vitastor_parse_filename(url, options, errp);
  196. if (*errp)
  197. {
  198. ret = -1;
  199. goto out;
  200. }
  201. // inodes don't require creation in Vitastor. FIXME: They will when there will be some metadata
  202. ret = 0;
  203. out:
  204. qobject_unref(options);
  205. return ret;
  206. }
  207. #if QEMU_VERSION_MAJOR >= 3
  208. static int coroutine_fn vitastor_co_truncate(BlockDriverState *bs, int64_t offset,
  209. #if QEMU_VERSION_MAJOR >= 4
  210. bool exact,
  211. #endif
  212. PreallocMode prealloc,
  213. #if QEMU_VERSION_MAJOR >= 5 && QEMU_VERSION_MINOR >= 1 || QEMU_VERSION_MAJOR > 5
  214. BdrvRequestFlags flags,
  215. #endif
  216. Error **errp)
  217. {
  218. VitastorClient *client = bs->opaque;
  219. if (prealloc != PREALLOC_MODE_OFF)
  220. {
  221. error_setg(errp, "Unsupported preallocation mode '%s'", PreallocMode_str(prealloc));
  222. return -ENOTSUP;
  223. }
  224. // TODO: Resize inode to <offset> bytes
  225. client->size = offset / BDRV_SECTOR_SIZE;
  226. return 0;
  227. }
  228. #endif
  229. static int vitastor_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
  230. {
  231. bdi->cluster_size = 4096;
  232. return 0;
  233. }
  234. static int64_t vitastor_getlength(BlockDriverState *bs)
  235. {
  236. VitastorClient *client = bs->opaque;
  237. return client->size;
  238. }
// Set the I/O alignment limits of the device to 4096 bytes.
// Both the callback signature and the field holding request_alignment depend on
// the QEMU major version, hence the preprocessor branches.
#if QEMU_VERSION_MAJOR >= 3
static void vitastor_refresh_limits(BlockDriverState *bs, Error **errp)
#else
static int vitastor_refresh_limits(BlockDriverState *bs)
#endif
{
#if QEMU_VERSION_MAJOR >= 4
    bs->bl.request_alignment = 4096;
    bs->bl.min_mem_alignment = 4096;
#else
    bs->request_alignment = 4096;
#endif
    bs->bl.opt_mem_alignment = 4096;
#if QEMU_VERSION_MAJOR < 3
    // Only the older (int-returning) variant of the callback reports a status
    return 0;
#endif
}
  256. static int64_t vitastor_get_allocated_file_size(BlockDriverState *bs)
  257. {
  258. return 0;
  259. }
  260. static void vitastor_co_init_task(BlockDriverState *bs, VitastorRPC *task)
  261. {
  262. *task = (VitastorRPC) {
  263. .co = qemu_coroutine_self(),
  264. .bs = bs,
  265. };
  266. }
  267. static void vitastor_co_generic_bh_cb(int retval, void *opaque)
  268. {
  269. VitastorRPC *task = opaque;
  270. task->ret = retval;
  271. task->complete = 1;
  272. if (qemu_coroutine_self() != task->co)
  273. {
  274. #if QEMU_VERSION_MAJOR >= 3
  275. aio_co_wake(task->co);
  276. #else
  277. qemu_coroutine_enter(task->co, NULL);
  278. qemu_aio_release(task);
  279. #endif
  280. }
  281. }
  282. static int coroutine_fn vitastor_co_preadv(BlockDriverState *bs, uint64_t offset, uint64_t bytes, QEMUIOVector *iov, int flags)
  283. {
  284. VitastorClient *client = bs->opaque;
  285. VitastorRPC task;
  286. vitastor_co_init_task(bs, &task);
  287. task.iov = iov;
  288. qemu_mutex_lock(&client->mutex);
  289. vitastor_proxy_rw(0, client->proxy, client->inode, offset, bytes, iov->iov, iov->niov, vitastor_co_generic_bh_cb, &task);
  290. qemu_mutex_unlock(&client->mutex);
  291. while (!task.complete)
  292. {
  293. qemu_coroutine_yield();
  294. }
  295. return task.ret;
  296. }
  297. static int coroutine_fn vitastor_co_pwritev(BlockDriverState *bs, uint64_t offset, uint64_t bytes, QEMUIOVector *iov, int flags)
  298. {
  299. VitastorClient *client = bs->opaque;
  300. VitastorRPC task;
  301. vitastor_co_init_task(bs, &task);
  302. task.iov = iov;
  303. qemu_mutex_lock(&client->mutex);
  304. vitastor_proxy_rw(1, client->proxy, client->inode, offset, bytes, iov->iov, iov->niov, vitastor_co_generic_bh_cb, &task);
  305. qemu_mutex_unlock(&client->mutex);
  306. while (!task.complete)
  307. {
  308. qemu_coroutine_yield();
  309. }
  310. return task.ret;
  311. }
  312. #if QEMU_VERSION_MAJOR < 3
  313. static int coroutine_fn vitastor_co_readv(BlockDriverState *bs, int64_t sector_num, int nb_sectors, QEMUIOVector *iov)
  314. {
  315. return vitastor_co_preadv(bs, sector_num*BDRV_SECTOR_SIZE, nb_sectors*BDRV_SECTOR_SIZE, iov, 0);
  316. }
  317. static int coroutine_fn vitastor_co_writev(BlockDriverState *bs, int64_t sector_num, int nb_sectors, QEMUIOVector *iov)
  318. {
  319. return vitastor_co_pwritev(bs, sector_num*BDRV_SECTOR_SIZE, nb_sectors*BDRV_SECTOR_SIZE, iov, 0);
  320. }
  321. #endif
  322. static int coroutine_fn vitastor_co_flush(BlockDriverState *bs)
  323. {
  324. VitastorClient *client = bs->opaque;
  325. VitastorRPC task;
  326. vitastor_co_init_task(bs, &task);
  327. qemu_mutex_lock(&client->mutex);
  328. vitastor_proxy_sync(client->proxy, vitastor_co_generic_bh_cb, &task);
  329. qemu_mutex_unlock(&client->mutex);
  330. while (!task.complete)
  331. {
  332. qemu_coroutine_yield();
  333. }
  334. return task.ret;
  335. }
// Options accepted by the create operation: only the virtual disk size.
// The option list type depends on the QEMU major version.
#if QEMU_VERSION_MAJOR >= 3
static QemuOptsList vitastor_create_opts = {
    .name = "vitastor-create-opts",
    .head = QTAILQ_HEAD_INITIALIZER(vitastor_create_opts.head),
    .desc = {
        {
            .name = BLOCK_OPT_SIZE,
            .type = QEMU_OPT_SIZE,
            .help = "Virtual disk size"
        },
        { /* end of list */ }
    }
};
#else
// Older variant: array of QEMUOptionParameter terminated by an entry with a NULL name
static QEMUOptionParameter vitastor_create_opts[] = {
    {
        .name = BLOCK_OPT_SIZE,
        .type = OPT_SIZE,
        .help = "Virtual disk size"
    },
    { NULL }
};
#endif
  359. static const char *vitastor_strong_runtime_opts[] = {
  360. "inode",
  361. "pool",
  362. "etcd_host",
  363. "etcd_prefix",
  364. NULL
  365. };
// Driver table registered with the block layer by vitastor_block_init().
// Several members only exist (or have different names) in some QEMU versions,
// hence the preprocessor branches.
static BlockDriver bdrv_vitastor = {
    .format_name = "vitastor",
    .protocol_name = "vitastor",
    // Size of the per-device state allocated as bs->opaque
    .instance_size = sizeof(VitastorClient),
    .bdrv_parse_filename = vitastor_parse_filename,
    .bdrv_has_zero_init = bdrv_has_zero_init_1,
    .bdrv_get_info = vitastor_get_info,
    .bdrv_getlength = vitastor_getlength,
#if QEMU_VERSION_MAJOR >= 3
    .bdrv_probe_blocksizes = vitastor_probe_blocksizes,
#endif
    .bdrv_refresh_limits = vitastor_refresh_limits,
    // FIXME: Implement it along with per-inode statistics
    //.bdrv_get_allocated_file_size = vitastor_get_allocated_file_size,
    .bdrv_file_open = vitastor_file_open,
    .bdrv_close = vitastor_close,
    // Option list for the create operation
#if QEMU_VERSION_MAJOR >= 3
    .create_opts = &vitastor_create_opts,
#else
    .create_options = vitastor_create_opts,
#endif
    // For qmp_blockdev_create(), used by the qemu monitor / QAPI
    // Requires patching QAPI IDL, thus unimplemented
    //.bdrv_co_create = vitastor_co_create,
#if QEMU_VERSION_MAJOR >= 3
    // For bdrv_create(), used by qemu-img
    .bdrv_co_create_opts = vitastor_co_create_opts,
    .bdrv_co_truncate = vitastor_co_truncate,
    // Byte-based I/O entry points
    .bdrv_co_preadv = vitastor_co_preadv,
    .bdrv_co_pwritev = vitastor_co_pwritev,
#else
    // Sector-based I/O entry points (thin wrappers around the byte-based ones)
    .bdrv_co_readv = vitastor_co_readv,
    .bdrv_co_writev = vitastor_co_writev,
#endif
    .bdrv_co_flush_to_disk = vitastor_co_flush,
#if QEMU_VERSION_MAJOR >= 4
    .strong_runtime_opts = vitastor_strong_runtime_opts,
#endif
};
  406. static void vitastor_block_init(void)
  407. {
  408. bdrv_register(&bdrv_vitastor);
  409. }
  410. block_init(vitastor_block_init);