Simplified distributed block storage with strong consistency, like in Ceph
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 
 
 
 

435 lines
15 KiB

  1. // Copyright (c) Vitaliy Filippov, 2019+
  2. // License: VNPL-1.1 (see README.md for details)
  3. #include <sys/socket.h>
  4. #include <sys/poll.h>
  5. #include <netinet/in.h>
  6. #include <netinet/tcp.h>
  7. #include <arpa/inet.h>
  8. #include "osd.h"
  9. #include "http_client.h"
  10. static blockstore_config_t json_to_bs(const json11::Json::object & config)
  11. {
  12. blockstore_config_t bs;
  13. for (auto kv: config)
  14. {
  15. if (kv.second.is_string())
  16. bs[kv.first] = kv.second.string_value();
  17. else
  18. bs[kv.first] = kv.second.dump();
  19. }
  20. return bs;
  21. }
  22. osd_t::osd_t(const json11::Json & config, ring_loop_t *ringloop)
  23. {
  24. zero_buffer_size = 1<<20;
  25. zero_buffer = malloc_or_die(zero_buffer_size);
  26. memset(zero_buffer, 0, zero_buffer_size);
  27. this->ringloop = ringloop;
  28. this->config = msgr.read_config(config).object_items();
  29. if (this->config.find("log_level") == this->config.end())
  30. this->config["log_level"] = 1;
  31. parse_config(this->config);
  32. epmgr = new epoll_manager_t(ringloop);
  33. // FIXME: Use timerfd_interval based directly on io_uring
  34. this->tfd = epmgr->tfd;
  35. // FIXME: Create Blockstore from on-disk superblock config and check it against the OSD cluster config
  36. auto bs_cfg = json_to_bs(this->config);
  37. this->bs = new blockstore_t(bs_cfg, ringloop, tfd);
  38. this->tfd->set_timer(print_stats_interval*1000, true, [this](int timer_id)
  39. {
  40. print_stats();
  41. });
  42. this->tfd->set_timer(slow_log_interval*1000, true, [this](int timer_id)
  43. {
  44. print_slow();
  45. });
  46. msgr.tfd = this->tfd;
  47. msgr.ringloop = this->ringloop;
  48. msgr.exec_op = [this](osd_op_t *op) { exec_op(op); };
  49. msgr.repeer_pgs = [this](osd_num_t peer_osd) { repeer_pgs(peer_osd); };
  50. msgr.init();
  51. init_cluster();
  52. consumer.loop = [this]() { loop(); };
  53. ringloop->register_consumer(&consumer);
  54. }
  55. osd_t::~osd_t()
  56. {
  57. ringloop->unregister_consumer(&consumer);
  58. delete epmgr;
  59. delete bs;
  60. close(listen_fd);
  61. free(zero_buffer);
  62. }
  63. void osd_t::parse_config(const json11::Json & config)
  64. {
  65. st_cli.parse_config(config);
  66. msgr.parse_config(config);
  67. // OSD number
  68. osd_num = config["osd_num"].uint64_value();
  69. if (!osd_num)
  70. throw std::runtime_error("osd_num is required in the configuration");
  71. msgr.osd_num = osd_num;
  72. // Vital Blockstore parameters
  73. bs_block_size = config["block_size"].uint64_value();
  74. if (!bs_block_size)
  75. bs_block_size = DEFAULT_BLOCK_SIZE;
  76. bs_bitmap_granularity = config["bitmap_granularity"].uint64_value();
  77. if (!bs_bitmap_granularity)
  78. bs_bitmap_granularity = DEFAULT_BITMAP_GRANULARITY;
  79. clean_entry_bitmap_size = bs_block_size / bs_bitmap_granularity / 8;
  80. // Bind address
  81. bind_address = config["bind_address"].string_value();
  82. if (bind_address == "")
  83. bind_address = "0.0.0.0";
  84. bind_port = config["bind_port"].uint64_value();
  85. if (bind_port <= 0 || bind_port > 65535)
  86. bind_port = 0;
  87. // OSD configuration
  88. log_level = config["log_level"].uint64_value();
  89. etcd_report_interval = config["etcd_report_interval"].uint64_value();
  90. if (etcd_report_interval <= 0)
  91. etcd_report_interval = 30;
  92. readonly = config["readonly"] == "true" || config["readonly"] == "1" || config["readonly"] == "yes";
  93. run_primary = config["run_primary"] != "false" && config["run_primary"] != "0" && config["run_primary"] != "no";
  94. no_rebalance = config["no_rebalance"] == "true" || config["no_rebalance"] == "1" || config["no_rebalance"] == "yes";
  95. no_recovery = config["no_recovery"] == "true" || config["no_recovery"] == "1" || config["no_recovery"] == "yes";
  96. allow_test_ops = config["allow_test_ops"] == "true" || config["allow_test_ops"] == "1" || config["allow_test_ops"] == "yes";
  97. if (config["immediate_commit"] == "all")
  98. immediate_commit = IMMEDIATE_ALL;
  99. else if (config["immediate_commit"] == "small")
  100. immediate_commit = IMMEDIATE_SMALL;
  101. else
  102. immediate_commit = IMMEDIATE_NONE;
  103. if (!config["autosync_interval"].is_null())
  104. {
  105. // Allow to set it to 0
  106. autosync_interval = config["autosync_interval"].uint64_value();
  107. if (autosync_interval > MAX_AUTOSYNC_INTERVAL)
  108. autosync_interval = DEFAULT_AUTOSYNC_INTERVAL;
  109. }
  110. if (!config["client_queue_depth"].is_null())
  111. {
  112. client_queue_depth = config["client_queue_depth"].uint64_value();
  113. if (client_queue_depth < 128)
  114. client_queue_depth = 128;
  115. }
  116. recovery_queue_depth = config["recovery_queue_depth"].uint64_value();
  117. if (recovery_queue_depth < 1 || recovery_queue_depth > MAX_RECOVERY_QUEUE)
  118. recovery_queue_depth = DEFAULT_RECOVERY_QUEUE;
  119. recovery_sync_batch = config["recovery_sync_batch"].uint64_value();
  120. if (recovery_sync_batch < 1 || recovery_sync_batch > MAX_RECOVERY_QUEUE)
  121. recovery_sync_batch = DEFAULT_RECOVERY_BATCH;
  122. print_stats_interval = config["print_stats_interval"].uint64_value();
  123. if (!print_stats_interval)
  124. print_stats_interval = 3;
  125. slow_log_interval = config["slow_log_interval"].uint64_value();
  126. if (!slow_log_interval)
  127. slow_log_interval = 10;
  128. }
  129. void osd_t::bind_socket()
  130. {
  131. listen_fd = socket(AF_INET, SOCK_STREAM, 0);
  132. if (listen_fd < 0)
  133. {
  134. throw std::runtime_error(std::string("socket: ") + strerror(errno));
  135. }
  136. int enable = 1;
  137. setsockopt(listen_fd, SOL_SOCKET, SO_REUSEADDR, &enable, sizeof(enable));
  138. sockaddr_in addr;
  139. int r;
  140. if ((r = inet_pton(AF_INET, bind_address.c_str(), &addr.sin_addr)) != 1)
  141. {
  142. close(listen_fd);
  143. throw std::runtime_error("bind address "+bind_address+(r == 0 ? " is not valid" : ": no ipv4 support"));
  144. }
  145. addr.sin_family = AF_INET;
  146. addr.sin_port = htons(bind_port);
  147. if (bind(listen_fd, (sockaddr*)&addr, sizeof(addr)) < 0)
  148. {
  149. close(listen_fd);
  150. throw std::runtime_error(std::string("bind: ") + strerror(errno));
  151. }
  152. if (bind_port == 0)
  153. {
  154. socklen_t len = sizeof(addr);
  155. if (getsockname(listen_fd, (sockaddr *)&addr, &len) == -1)
  156. {
  157. close(listen_fd);
  158. throw std::runtime_error(std::string("getsockname: ") + strerror(errno));
  159. }
  160. listening_port = ntohs(addr.sin_port);
  161. }
  162. else
  163. {
  164. listening_port = bind_port;
  165. }
  166. if (listen(listen_fd, listen_backlog) < 0)
  167. {
  168. close(listen_fd);
  169. throw std::runtime_error(std::string("listen: ") + strerror(errno));
  170. }
  171. fcntl(listen_fd, F_SETFL, fcntl(listen_fd, F_GETFL, 0) | O_NONBLOCK);
  172. epmgr->set_fd_handler(listen_fd, false, [this](int fd, int events)
  173. {
  174. msgr.accept_connections(listen_fd);
  175. });
  176. }
  177. bool osd_t::shutdown()
  178. {
  179. stopping = true;
  180. if (inflight_ops > 0)
  181. {
  182. return false;
  183. }
  184. return !bs || bs->is_safe_to_stop();
  185. }
  186. void osd_t::loop()
  187. {
  188. handle_peers();
  189. msgr.read_requests();
  190. msgr.send_replies();
  191. ringloop->submit();
  192. }
  193. void osd_t::exec_op(osd_op_t *cur_op)
  194. {
  195. clock_gettime(CLOCK_REALTIME, &cur_op->tv_begin);
  196. if (stopping)
  197. {
  198. // Throw operation away
  199. delete cur_op;
  200. return;
  201. }
  202. // Clear the reply buffer
  203. memset(cur_op->reply.buf, 0, OSD_PACKET_SIZE);
  204. inflight_ops++;
  205. if (cur_op->req.hdr.magic != SECONDARY_OSD_OP_MAGIC ||
  206. cur_op->req.hdr.opcode < OSD_OP_MIN || cur_op->req.hdr.opcode > OSD_OP_MAX ||
  207. ((cur_op->req.hdr.opcode == OSD_OP_SEC_READ ||
  208. cur_op->req.hdr.opcode == OSD_OP_SEC_WRITE ||
  209. cur_op->req.hdr.opcode == OSD_OP_SEC_WRITE_STABLE) &&
  210. (cur_op->req.sec_rw.len > OSD_RW_MAX ||
  211. cur_op->req.sec_rw.len % bs_bitmap_granularity ||
  212. cur_op->req.sec_rw.offset % bs_bitmap_granularity)) ||
  213. ((cur_op->req.hdr.opcode == OSD_OP_READ ||
  214. cur_op->req.hdr.opcode == OSD_OP_WRITE ||
  215. cur_op->req.hdr.opcode == OSD_OP_DELETE) &&
  216. (cur_op->req.rw.len > OSD_RW_MAX ||
  217. cur_op->req.rw.len % bs_bitmap_granularity ||
  218. cur_op->req.rw.offset % bs_bitmap_granularity)))
  219. {
  220. // Bad command
  221. finish_op(cur_op, -EINVAL);
  222. return;
  223. }
  224. if (cur_op->req.hdr.opcode == OSD_OP_PING)
  225. {
  226. // Pong
  227. finish_op(cur_op, 0);
  228. return;
  229. }
  230. if (readonly &&
  231. cur_op->req.hdr.opcode != OSD_OP_SEC_READ &&
  232. cur_op->req.hdr.opcode != OSD_OP_SEC_LIST &&
  233. cur_op->req.hdr.opcode != OSD_OP_READ &&
  234. cur_op->req.hdr.opcode != OSD_OP_SEC_READ_BMP &&
  235. cur_op->req.hdr.opcode != OSD_OP_SHOW_CONFIG)
  236. {
  237. // Readonly mode
  238. finish_op(cur_op, -EROFS);
  239. return;
  240. }
  241. if (cur_op->req.hdr.opcode == OSD_OP_TEST_SYNC_STAB_ALL)
  242. {
  243. exec_sync_stab_all(cur_op);
  244. }
  245. else if (cur_op->req.hdr.opcode == OSD_OP_SHOW_CONFIG)
  246. {
  247. exec_show_config(cur_op);
  248. }
  249. else if (cur_op->req.hdr.opcode == OSD_OP_READ)
  250. {
  251. continue_primary_read(cur_op);
  252. }
  253. else if (cur_op->req.hdr.opcode == OSD_OP_WRITE)
  254. {
  255. continue_primary_write(cur_op);
  256. }
  257. else if (cur_op->req.hdr.opcode == OSD_OP_SYNC)
  258. {
  259. continue_primary_sync(cur_op);
  260. }
  261. else if (cur_op->req.hdr.opcode == OSD_OP_DELETE)
  262. {
  263. continue_primary_del(cur_op);
  264. }
  265. else
  266. {
  267. exec_secondary(cur_op);
  268. }
  269. }
  270. void osd_t::reset_stats()
  271. {
  272. msgr.stats = { 0 };
  273. prev_stats = { 0 };
  274. memset(recovery_stat_count, 0, sizeof(recovery_stat_count));
  275. memset(recovery_stat_bytes, 0, sizeof(recovery_stat_bytes));
  276. }
  277. void osd_t::print_stats()
  278. {
  279. for (int i = OSD_OP_MIN; i <= OSD_OP_MAX; i++)
  280. {
  281. if (msgr.stats.op_stat_count[i] != prev_stats.op_stat_count[i] && i != OSD_OP_PING)
  282. {
  283. uint64_t avg = (msgr.stats.op_stat_sum[i] - prev_stats.op_stat_sum[i])/(msgr.stats.op_stat_count[i] - prev_stats.op_stat_count[i]);
  284. uint64_t bw = (msgr.stats.op_stat_bytes[i] - prev_stats.op_stat_bytes[i]) / print_stats_interval;
  285. if (msgr.stats.op_stat_bytes[i] != 0)
  286. {
  287. printf(
  288. "[OSD %lu] avg latency for op %d (%s): %lu us, B/W: %.2f %s\n", osd_num, i, osd_op_names[i], avg,
  289. (bw > 1024*1024*1024 ? bw/1024.0/1024/1024 : (bw > 1024*1024 ? bw/1024.0/1024 : bw/1024.0)),
  290. (bw > 1024*1024*1024 ? "GB/s" : (bw > 1024*1024 ? "MB/s" : "KB/s"))
  291. );
  292. }
  293. else
  294. {
  295. printf("[OSD %lu] avg latency for op %d (%s): %lu us\n", osd_num, i, osd_op_names[i], avg);
  296. }
  297. prev_stats.op_stat_count[i] = msgr.stats.op_stat_count[i];
  298. prev_stats.op_stat_sum[i] = msgr.stats.op_stat_sum[i];
  299. prev_stats.op_stat_bytes[i] = msgr.stats.op_stat_bytes[i];
  300. }
  301. }
  302. for (int i = OSD_OP_MIN; i <= OSD_OP_MAX; i++)
  303. {
  304. if (msgr.stats.subop_stat_count[i] != prev_stats.subop_stat_count[i])
  305. {
  306. uint64_t avg = (msgr.stats.subop_stat_sum[i] - prev_stats.subop_stat_sum[i])/(msgr.stats.subop_stat_count[i] - prev_stats.subop_stat_count[i]);
  307. printf("[OSD %lu] avg latency for subop %d (%s): %ld us\n", osd_num, i, osd_op_names[i], avg);
  308. prev_stats.subop_stat_count[i] = msgr.stats.subop_stat_count[i];
  309. prev_stats.subop_stat_sum[i] = msgr.stats.subop_stat_sum[i];
  310. }
  311. }
  312. for (int i = 0; i < 2; i++)
  313. {
  314. if (recovery_stat_count[0][i] != recovery_stat_count[1][i])
  315. {
  316. uint64_t bw = (recovery_stat_bytes[0][i] - recovery_stat_bytes[1][i]) / print_stats_interval;
  317. printf(
  318. "[OSD %lu] %s recovery: %.1f op/s, B/W: %.2f %s\n", osd_num, recovery_stat_names[i],
  319. (recovery_stat_count[0][i] - recovery_stat_count[1][i]) * 1.0 / print_stats_interval,
  320. (bw > 1024*1024*1024 ? bw/1024.0/1024/1024 : (bw > 1024*1024 ? bw/1024.0/1024 : bw/1024.0)),
  321. (bw > 1024*1024*1024 ? "GB/s" : (bw > 1024*1024 ? "MB/s" : "KB/s"))
  322. );
  323. recovery_stat_count[1][i] = recovery_stat_count[0][i];
  324. recovery_stat_bytes[1][i] = recovery_stat_bytes[0][i];
  325. }
  326. }
  327. if (incomplete_objects > 0)
  328. {
  329. printf("[OSD %lu] %lu object(s) incomplete\n", osd_num, incomplete_objects);
  330. }
  331. if (degraded_objects > 0)
  332. {
  333. printf("[OSD %lu] %lu object(s) degraded\n", osd_num, degraded_objects);
  334. }
  335. if (misplaced_objects > 0)
  336. {
  337. printf("[OSD %lu] %lu object(s) misplaced\n", osd_num, misplaced_objects);
  338. }
  339. }
  340. void osd_t::print_slow()
  341. {
  342. char alloc[1024];
  343. timespec now;
  344. clock_gettime(CLOCK_REALTIME, &now);
  345. for (auto & kv: msgr.clients)
  346. {
  347. for (auto op: kv.second->received_ops)
  348. {
  349. if ((now.tv_sec - op->tv_begin.tv_sec) >= slow_log_interval)
  350. {
  351. int l = sizeof(alloc), n;
  352. char *buf = alloc;
  353. #define bufprintf(s, ...) { n = snprintf(buf, l, s, __VA_ARGS__); n = n < 0 ? 0 : n; buf += n; l -= n; }
  354. bufprintf("[OSD %lu] Slow op", osd_num);
  355. if (kv.second->osd_num)
  356. {
  357. bufprintf(" from peer OSD %lu (client %d)", kv.second->osd_num, kv.second->peer_fd);
  358. }
  359. else
  360. {
  361. bufprintf(" from client %d", kv.second->peer_fd);
  362. }
  363. bufprintf(": %s id=%lu", osd_op_names[op->req.hdr.opcode], op->req.hdr.id);
  364. if (op->req.hdr.opcode == OSD_OP_SEC_READ || op->req.hdr.opcode == OSD_OP_SEC_WRITE ||
  365. op->req.hdr.opcode == OSD_OP_SEC_WRITE_STABLE || op->req.hdr.opcode == OSD_OP_SEC_DELETE)
  366. {
  367. bufprintf(" %lx:%lx v", op->req.sec_rw.oid.inode, op->req.sec_rw.oid.stripe);
  368. if (op->req.sec_rw.version == UINT64_MAX)
  369. {
  370. bufprintf("%s", "max");
  371. }
  372. else
  373. {
  374. bufprintf("%lu", op->req.sec_rw.version);
  375. }
  376. if (op->req.hdr.opcode != OSD_OP_SEC_DELETE)
  377. {
  378. bufprintf(" offset=%x len=%x", op->req.sec_rw.offset, op->req.sec_rw.len);
  379. }
  380. }
  381. else if (op->req.hdr.opcode == OSD_OP_SEC_STABILIZE || op->req.hdr.opcode == OSD_OP_SEC_ROLLBACK)
  382. {
  383. for (uint64_t i = 0; i < op->req.sec_stab.len; i += sizeof(obj_ver_id))
  384. {
  385. obj_ver_id *ov = (obj_ver_id*)(op->buf + i);
  386. bufprintf(i == 0 ? " %lx:%lx v%lu" : ", %lx:%lx v%lu", ov->oid.inode, ov->oid.stripe, ov->version);
  387. }
  388. }
  389. else if (op->req.hdr.opcode == OSD_OP_SEC_LIST)
  390. {
  391. bufprintf(
  392. " inode=%lx-%lx pg=%u/%u, stripe=%lu",
  393. op->req.sec_list.min_inode, op->req.sec_list.max_inode,
  394. op->req.sec_list.list_pg, op->req.sec_list.pg_count,
  395. op->req.sec_list.pg_stripe_size
  396. );
  397. }
  398. else if (op->req.hdr.opcode == OSD_OP_READ || op->req.hdr.opcode == OSD_OP_WRITE ||
  399. op->req.hdr.opcode == OSD_OP_DELETE)
  400. {
  401. bufprintf(" inode=%lx offset=%lx len=%x", op->req.rw.inode, op->req.rw.offset, op->req.rw.len);
  402. }
  403. #undef bufprintf
  404. printf("%s\n", alloc);
  405. }
  406. }
  407. }
  408. }