Simplified distributed block storage with strong consistency, like in Ceph
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 
 
 
 

453 lines
12 KiB

  1. // Copyright (c) Vitaliy Filippov, 2019+
  2. // License: VNPL-1.1 or GNU GPL-2.0+ (see README.md for details)
  3. // FIO engine to test cluster I/O
  4. //
  5. // Random write:
  6. //
  7. // fio -thread -ioengine=./libfio_cluster.so -name=test -bs=4k -direct=1 -fsync=16 -iodepth=16 -rw=randwrite \
  8. // -etcd=127.0.0.1:2379 [-etcd_prefix=/vitastor] (-image=testimg | -pool=1 -inode=1 -size=1000M)
  9. //
  10. // Linear write:
  11. //
  12. // fio -thread -ioengine=./libfio_cluster.so -name=test -bs=128k -direct=1 -fsync=32 -iodepth=32 -rw=write \
  13. // -etcd=127.0.0.1:2379 [-etcd_prefix=/vitastor] -image=testimg
  14. //
  15. // Random read (run with -iodepth=32 or -iodepth=1):
  16. //
  17. // fio -thread -ioengine=./libfio_cluster.so -name=test -bs=4k -direct=1 -iodepth=32 -rw=randread \
  18. // -etcd=127.0.0.1:2379 [-etcd_prefix=/vitastor] -image=testimg
  19. #include <sys/types.h>
  20. #include <sys/socket.h>
  21. #include <netinet/in.h>
  22. #include <netinet/tcp.h>
  23. #include <vector>
  24. #include "epoll_manager.h"
  25. #include "cluster_client.h"
  26. #include "fio_headers.h"
  27. struct sec_data
  28. {
  29. ring_loop_t *ringloop = NULL;
  30. epoll_manager_t *epmgr = NULL;
  31. cluster_client_t *cli = NULL;
  32. inode_watch_t *watch = NULL;
  33. bool last_sync = false;
  34. /* The list of completed io_u structs. */
  35. std::vector<io_u*> completed;
  36. uint64_t op_n = 0, inflight = 0;
  37. bool trace = false;
  38. };
  // Engine-specific options. Fields are addressed by byte offset from the
  // options[] table, so the member order and types define the layout that
  // the option parser writes into.
  // NOTE(review): the in-class initializers only apply if the struct is
  // constructed by C++ code; presumably fio allocates it itself - confirm.
  struct sec_options
  {
      // Leading padding member; presumably keeps every real option at a non-zero offset - TODO confirm against fio
      int __pad;
      // "conf": path to the Vitastor configuration file
      char *config_path = nullptr;
      // "etcd": etcd address
      char *etcd_host = nullptr;
      // "etcd_prefix": etcd key prefix
      char *etcd_prefix = nullptr;
      // "image": image name; when set, pool/inode are ignored by sec_setup()
      char *image = nullptr;
      // "pool" and "inode": numeric target used when no image name is given
      uint64_t pool = 0;
      uint64_t inode = 0;
      // "cluster_log_level"
      int cluster_log = 0;
      // "osd_trace"
      int trace = 0;
      // "use_rdma"; the option table gives it a default of -1, which sec_setup() treats as "not specified"
      int use_rdma = 0;
      // "rdma_device", "rdma_port_num", "rdma_gid_index", "rdma_mtu"
      char *rdma_device = nullptr;
      int rdma_port_num = 0;
      int rdma_gid_index = 0;
      int rdma_mtu = 0;
  };
  56. static struct fio_option options[] = {
  57. {
  58. .name = "conf",
  59. .lname = "Vitastor config path",
  60. .type = FIO_OPT_STR_STORE,
  61. .off1 = offsetof(struct sec_options, config_path),
  62. .help = "Vitastor config path",
  63. .category = FIO_OPT_C_ENGINE,
  64. .group = FIO_OPT_G_FILENAME,
  65. },
  66. {
  67. .name = "etcd",
  68. .lname = "etcd address",
  69. .type = FIO_OPT_STR_STORE,
  70. .off1 = offsetof(struct sec_options, etcd_host),
  71. .help = "etcd address in the form HOST:PORT[/PATH]",
  72. .category = FIO_OPT_C_ENGINE,
  73. .group = FIO_OPT_G_FILENAME,
  74. },
  75. {
  76. .name = "etcd_prefix",
  77. .lname = "etcd key prefix",
  78. .type = FIO_OPT_STR_STORE,
  79. .off1 = offsetof(struct sec_options, etcd_prefix),
  80. .help = "etcd key prefix, by default /vitastor",
  81. .category = FIO_OPT_C_ENGINE,
  82. .group = FIO_OPT_G_FILENAME,
  83. },
  84. {
  85. .name = "image",
  86. .lname = "Vitastor image name",
  87. .type = FIO_OPT_STR_STORE,
  88. .off1 = offsetof(struct sec_options, image),
  89. .help = "Vitastor image name to run tests on",
  90. .category = FIO_OPT_C_ENGINE,
  91. .group = FIO_OPT_G_FILENAME,
  92. },
  93. {
  94. .name = "pool",
  95. .lname = "pool number for the inode",
  96. .type = FIO_OPT_INT,
  97. .off1 = offsetof(struct sec_options, pool),
  98. .help = "pool number for the inode to run tests on",
  99. .category = FIO_OPT_C_ENGINE,
  100. .group = FIO_OPT_G_FILENAME,
  101. },
  102. {
  103. .name = "inode",
  104. .lname = "inode to run tests on",
  105. .type = FIO_OPT_INT,
  106. .off1 = offsetof(struct sec_options, inode),
  107. .help = "inode number to run tests on",
  108. .category = FIO_OPT_C_ENGINE,
  109. .group = FIO_OPT_G_FILENAME,
  110. },
  111. {
  112. .name = "cluster_log_level",
  113. .lname = "cluster log level",
  114. .type = FIO_OPT_INT,
  115. .off1 = offsetof(struct sec_options, cluster_log),
  116. .help = "Set log level for the Vitastor client",
  117. .def = "0",
  118. .category = FIO_OPT_C_ENGINE,
  119. .group = FIO_OPT_G_FILENAME,
  120. },
  121. {
  122. .name = "osd_trace",
  123. .lname = "OSD trace",
  124. .type = FIO_OPT_BOOL,
  125. .off1 = offsetof(struct sec_options, trace),
  126. .help = "Trace OSD operations",
  127. .def = "0",
  128. .category = FIO_OPT_C_ENGINE,
  129. .group = FIO_OPT_G_FILENAME,
  130. },
  131. {
  132. .name = "use_rdma",
  133. .lname = "Use RDMA",
  134. .type = FIO_OPT_BOOL,
  135. .off1 = offsetof(struct sec_options, use_rdma),
  136. .help = "Use RDMA",
  137. .def = "-1",
  138. .category = FIO_OPT_C_ENGINE,
  139. .group = FIO_OPT_G_FILENAME,
  140. },
  141. {
  142. .name = "rdma_device",
  143. .lname = "RDMA device name",
  144. .type = FIO_OPT_STR_STORE,
  145. .off1 = offsetof(struct sec_options, rdma_device),
  146. .help = "RDMA device name",
  147. .category = FIO_OPT_C_ENGINE,
  148. .group = FIO_OPT_G_FILENAME,
  149. },
  150. {
  151. .name = "rdma_port_num",
  152. .lname = "RDMA port number",
  153. .type = FIO_OPT_INT,
  154. .off1 = offsetof(struct sec_options, rdma_port_num),
  155. .help = "RDMA port number",
  156. .def = "0",
  157. .category = FIO_OPT_C_ENGINE,
  158. .group = FIO_OPT_G_FILENAME,
  159. },
  160. {
  161. .name = "rdma_gid_index",
  162. .lname = "RDMA gid index",
  163. .type = FIO_OPT_INT,
  164. .off1 = offsetof(struct sec_options, rdma_gid_index),
  165. .help = "RDMA gid index",
  166. .def = "0",
  167. .category = FIO_OPT_C_ENGINE,
  168. .group = FIO_OPT_G_FILENAME,
  169. },
  170. {
  171. .name = "rdma_mtu",
  172. .lname = "RDMA path MTU",
  173. .type = FIO_OPT_INT,
  174. .off1 = offsetof(struct sec_options, rdma_mtu),
  175. .help = "RDMA path MTU",
  176. .def = "0",
  177. .category = FIO_OPT_C_ENGINE,
  178. .group = FIO_OPT_G_FILENAME,
  179. },
  180. {
  181. .name = NULL,
  182. },
  183. };
  184. static int sec_setup(struct thread_data *td)
  185. {
  186. sec_options *o = (sec_options*)td->eo;
  187. sec_data *bsd;
  188. bsd = new sec_data;
  189. if (!bsd)
  190. {
  191. td_verror(td, errno, "calloc");
  192. return 1;
  193. }
  194. td->io_ops_data = bsd;
  195. if (!td->files_index)
  196. {
  197. add_file(td, "osd_cluster", 0, 0);
  198. td->o.nr_files = td->o.nr_files ? : 1;
  199. td->o.open_files++;
  200. }
  201. json11::Json::object cfg;
  202. if (o->config_path)
  203. cfg["config_path"] = std::string(o->config_path);
  204. if (o->etcd_host)
  205. cfg["etcd_address"] = std::string(o->etcd_host);
  206. if (o->etcd_prefix)
  207. cfg["etcd_prefix"] = std::string(o->etcd_prefix);
  208. if (o->rdma_device)
  209. cfg["rdma_device"] = std::string(o->rdma_device);
  210. if (o->rdma_port_num)
  211. cfg["rdma_port_num"] = o->rdma_port_num;
  212. if (o->rdma_gid_index)
  213. cfg["rdma_gid_index"] = o->rdma_gid_index;
  214. if (o->rdma_mtu)
  215. cfg["rdma_mtu"] = o->rdma_mtu;
  216. if (o->cluster_log)
  217. cfg["log_level"] = o->cluster_log;
  218. if (o->use_rdma != -1)
  219. cfg["use_rdma"] = o->use_rdma;
  220. json11::Json cfg_json(cfg);
  221. if (!o->image)
  222. {
  223. if (!(o->inode & ((1l << (64-POOL_ID_BITS)) - 1)))
  224. {
  225. td_verror(td, EINVAL, "inode number is missing");
  226. return 1;
  227. }
  228. if (o->pool)
  229. {
  230. o->inode = (o->inode & ((1l << (64-POOL_ID_BITS)) - 1)) | (o->pool << (64-POOL_ID_BITS));
  231. }
  232. if (!(o->inode >> (64-POOL_ID_BITS)))
  233. {
  234. td_verror(td, EINVAL, "pool is missing");
  235. return 1;
  236. }
  237. }
  238. else
  239. {
  240. o->inode = 0;
  241. }
  242. bsd->ringloop = new ring_loop_t(512);
  243. bsd->epmgr = new epoll_manager_t(bsd->ringloop);
  244. bsd->cli = new cluster_client_t(bsd->ringloop, bsd->epmgr->tfd, cfg_json);
  245. if (o->image)
  246. {
  247. while (!bsd->cli->is_ready())
  248. {
  249. bsd->ringloop->loop();
  250. if (bsd->cli->is_ready())
  251. break;
  252. bsd->ringloop->wait();
  253. }
  254. bsd->watch = bsd->cli->st_cli.watch_inode(std::string(o->image));
  255. td->files[0]->real_file_size = bsd->watch->cfg.size;
  256. }
  257. bsd->trace = o->trace ? true : false;
  258. return 0;
  259. }
  260. static void sec_cleanup(struct thread_data *td)
  261. {
  262. sec_data *bsd = (sec_data*)td->io_ops_data;
  263. if (bsd)
  264. {
  265. if (bsd->watch)
  266. {
  267. bsd->cli->st_cli.close_watch(bsd->watch);
  268. }
  269. delete bsd->cli;
  270. delete bsd->epmgr;
  271. delete bsd->ringloop;
  272. delete bsd;
  273. }
  274. }
  /* Per-thread init hook. Nothing is done here; the client is created in sec_setup(). Always returns 0. */
  static int sec_init(struct thread_data *td)
  {
      (void)td;
      return 0;
  }
  280. /* Begin read or write request. */
  281. static enum fio_q_status sec_queue(struct thread_data *td, struct io_u *io)
  282. {
  283. sec_options *opt = (sec_options*)td->eo;
  284. sec_data *bsd = (sec_data*)td->io_ops_data;
  285. int n = bsd->op_n;
  286. fio_ro_check(td, io);
  287. if (io->ddir == DDIR_SYNC && bsd->last_sync)
  288. {
  289. return FIO_Q_COMPLETED;
  290. }
  291. io->engine_data = bsd;
  292. cluster_op_t *op = new cluster_op_t;
  293. op->inode = opt->image ? bsd->watch->cfg.num : opt->inode;
  294. switch (io->ddir)
  295. {
  296. case DDIR_READ:
  297. op->opcode = OSD_OP_READ;
  298. op->offset = io->offset;
  299. op->len = io->xfer_buflen;
  300. op->iov.push_back(io->xfer_buf, io->xfer_buflen);
  301. bsd->last_sync = false;
  302. break;
  303. case DDIR_WRITE:
  304. if (opt->image && bsd->watch->cfg.readonly)
  305. {
  306. io->error = EROFS;
  307. return FIO_Q_COMPLETED;
  308. }
  309. op->opcode = OSD_OP_WRITE;
  310. op->offset = io->offset;
  311. op->len = io->xfer_buflen;
  312. op->iov.push_back(io->xfer_buf, io->xfer_buflen);
  313. bsd->last_sync = false;
  314. break;
  315. case DDIR_SYNC:
  316. op->opcode = OSD_OP_SYNC;
  317. bsd->last_sync = true;
  318. break;
  319. default:
  320. io->error = EINVAL;
  321. return FIO_Q_COMPLETED;
  322. }
  323. op->callback = [io, n](cluster_op_t *op)
  324. {
  325. io->error = op->retval < 0 ? -op->retval : 0;
  326. sec_data *bsd = (sec_data*)io->engine_data;
  327. bsd->inflight--;
  328. bsd->completed.push_back(io);
  329. if (bsd->trace)
  330. {
  331. printf("--- %s n=%d retval=%d\n", io->ddir == DDIR_READ ? "READ" :
  332. (io->ddir == DDIR_WRITE ? "WRITE" : "SYNC"), n, op->retval);
  333. }
  334. delete op;
  335. };
  336. if (opt->trace)
  337. {
  338. if (io->ddir == DDIR_SYNC)
  339. {
  340. printf("+++ SYNC # %d\n", n);
  341. }
  342. else
  343. {
  344. printf("+++ %s # %d 0x%llx+%llx\n",
  345. io->ddir == DDIR_READ ? "READ" : "WRITE",
  346. n, io->offset, io->xfer_buflen);
  347. }
  348. }
  349. io->error = 0;
  350. bsd->inflight++;
  351. bsd->op_n++;
  352. bsd->cli->execute(op);
  353. if (io->error != 0)
  354. return FIO_Q_COMPLETED;
  355. return FIO_Q_QUEUED;
  356. }
  357. static int sec_getevents(struct thread_data *td, unsigned int min, unsigned int max, const struct timespec *t)
  358. {
  359. sec_data *bsd = (sec_data*)td->io_ops_data;
  360. while (true)
  361. {
  362. bsd->ringloop->loop();
  363. if (bsd->completed.size() >= min)
  364. break;
  365. bsd->ringloop->wait();
  366. }
  367. return bsd->completed.size();
  368. }
  369. static struct io_u *sec_event(struct thread_data *td, int event)
  370. {
  371. sec_data *bsd = (sec_data*)td->io_ops_data;
  372. if (bsd->completed.size() == 0)
  373. return NULL;
  374. /* FIXME We ignore the event number and assume fio calls us exactly once for [0..nr_events-1] */
  375. struct io_u *ev = bsd->completed.back();
  376. bsd->completed.pop_back();
  377. return ev;
  378. }
  379. static int sec_io_u_init(struct thread_data *td, struct io_u *io)
  380. {
  381. io->engine_data = NULL;
  382. return 0;
  383. }
  /* io_u free hook: nothing to release, the engine keeps no per-io_u allocation. */
  static void sec_io_u_free(struct thread_data *td, struct io_u *io)
  {
      (void)td;
      (void)io;
  }
  /* open_file hook: no action is taken for the file. Always returns 0. */
  static int sec_open_file(struct thread_data *td, struct fio_file *f)
  {
      (void)td;
      (void)f;
      return 0;
  }
  /* invalidate hook: does nothing. Always returns 0. */
  static int sec_invalidate(struct thread_data *td, struct fio_file *f)
  {
      (void)td;
      (void)f;
      return 0;
  }
  395. struct ioengine_ops ioengine = {
  396. .name = "vitastor_cluster",
  397. .version = FIO_IOOPS_VERSION,
  398. .flags = FIO_MEMALIGN | FIO_DISKLESSIO | FIO_NOEXTEND,
  399. .setup = sec_setup,
  400. .init = sec_init,
  401. .queue = sec_queue,
  402. .getevents = sec_getevents,
  403. .event = sec_event,
  404. .cleanup = sec_cleanup,
  405. .open_file = sec_open_file,
  406. .invalidate = sec_invalidate,
  407. .io_u_init = sec_io_u_init,
  408. .io_u_free = sec_io_u_free,
  409. .option_struct_size = sizeof(struct sec_options),
  410. .options = options,
  411. };
  412. static void fio_init fio_sec_register(void)
  413. {
  414. register_ioengine(&ioengine);
  415. }
  416. static void fio_exit fio_sec_unregister(void)
  417. {
  418. unregister_ioengine(&ioengine);
  419. }