Simplified distributed block storage with strong consistency, like in Ceph
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

406 lines
11 KiB

2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
2 years ago
  1. #include <sys/socket.h>
  2. #include <sys/epoll.h>
  3. #include <sys/poll.h>
  4. #include <netinet/in.h>
  5. #include <netinet/tcp.h>
  6. #include <arpa/inet.h>
  7. #include "osd.h"
  // Printable operation names, indexed by opcode number.
  // Used by the periodic latency report to label both op and subop statistics.
  // Index 0 is an empty placeholder so that names line up with their index.
  static const char* osd_op_names[] = {
      /*  0 */ "",
      /*  1 */ "read",
      /*  2 */ "write",
      /*  3 */ "sync",
      /*  4 */ "stabilize",
      /*  5 */ "rollback",
      /*  6 */ "delete",
      /*  7 */ "sync_stab_all",
      /*  8 */ "list",
      /*  9 */ "show_config",
      /* 10 */ "primary_read",
      /* 11 */ "primary_write",
      /* 12 */ "primary_sync",
  };
  23. osd_t::osd_t(blockstore_config_t & config, blockstore_t *bs, ring_loop_t *ringloop)
  24. {
  25. this->config = config;
  26. this->bs = bs;
  27. this->ringloop = ringloop;
  28. this->tick_tfd = new timerfd_interval(ringloop, 3, [this]()
  29. {
  30. for (int i = 0; i <= OSD_OP_MAX; i++)
  31. {
  32. if (op_stat_count[i] != 0)
  33. {
  34. printf("avg latency for op %d (%s): %ld us\n", i, osd_op_names[i], op_stat_sum[i]/op_stat_count[i]);
  35. op_stat_count[i] = 0;
  36. op_stat_sum[i] = 0;
  37. }
  38. }
  39. for (int i = 0; i <= OSD_OP_MAX; i++)
  40. {
  41. if (subop_stat_count[i] != 0)
  42. {
  43. printf("avg latency for subop %d (%s): %ld us\n", i, osd_op_names[i], subop_stat_sum[i]/subop_stat_count[i]);
  44. subop_stat_count[i] = 0;
  45. subop_stat_sum[i] = 0;
  46. }
  47. }
  48. if (send_stat_count != 0)
  49. {
  50. printf("avg latency to send stabilize subop: %ld us\n", send_stat_sum/send_stat_count);
  51. send_stat_count = 0;
  52. send_stat_sum = 0;
  53. }
  54. });
  55. this->bs_block_size = bs->get_block_size();
  56. // FIXME: use bitmap granularity instead
  57. this->bs_disk_alignment = bs->get_disk_alignment();
  58. bind_address = config["bind_address"];
  59. if (bind_address == "")
  60. bind_address = "0.0.0.0";
  61. bind_port = strtoull(config["bind_port"].c_str(), NULL, 10);
  62. if (!bind_port || bind_port > 65535)
  63. bind_port = 11203;
  64. osd_num = strtoull(config["osd_num"].c_str(), NULL, 10);
  65. if (!osd_num)
  66. throw std::runtime_error("osd_num is required in the configuration");
  67. run_primary = config["run_primary"] == "true" || config["run_primary"] == "1" || config["run_primary"] == "yes";
  68. if (run_primary)
  69. init_primary();
  70. listen_fd = socket(AF_INET, SOCK_STREAM, 0);
  71. if (listen_fd < 0)
  72. {
  73. throw std::runtime_error(std::string("socket: ") + strerror(errno));
  74. }
  75. int enable = 1;
  76. setsockopt(listen_fd, SOL_SOCKET, SO_REUSEADDR, &enable, sizeof(enable));
  77. sockaddr_in addr;
  78. int r;
  79. if ((r = inet_pton(AF_INET, bind_address.c_str(), &addr.sin_addr)) != 1)
  80. {
  81. close(listen_fd);
  82. throw std::runtime_error("bind address "+bind_address+(r == 0 ? " is not valid" : ": no ipv4 support"));
  83. }
  84. addr.sin_family = AF_INET;
  85. addr.sin_port = htons(bind_port);
  86. if (bind(listen_fd, (sockaddr*)&addr, sizeof(addr)) < 0)
  87. {
  88. close(listen_fd);
  89. throw std::runtime_error(std::string("bind: ") + strerror(errno));
  90. }
  91. if (listen(listen_fd, listen_backlog) < 0)
  92. {
  93. close(listen_fd);
  94. throw std::runtime_error(std::string("listen: ") + strerror(errno));
  95. }
  96. fcntl(listen_fd, F_SETFL, fcntl(listen_fd, F_GETFL, 0) | O_NONBLOCK);
  97. epoll_fd = epoll_create(1);
  98. if (epoll_fd < 0)
  99. {
  100. close(listen_fd);
  101. throw std::runtime_error(std::string("epoll_create: ") + strerror(errno));
  102. }
  103. epoll_fd_index = ringloop->register_fd(epoll_fd);
  104. epoll_event ev;
  105. ev.data.fd = listen_fd;
  106. ev.events = EPOLLIN | EPOLLET;
  107. if (epoll_ctl(epoll_fd, EPOLL_CTL_ADD, listen_fd, &ev) < 0)
  108. {
  109. close(listen_fd);
  110. close(epoll_fd);
  111. throw std::runtime_error(std::string("epoll_ctl: ") + strerror(errno));
  112. }
  113. consumer.loop = [this]() { loop(); };
  114. ringloop->register_consumer(consumer);
  115. }
  116. osd_t::~osd_t()
  117. {
  118. delete tick_tfd;
  119. ringloop->unregister_consumer(consumer);
  120. close(epoll_fd);
  121. close(listen_fd);
  122. }
  123. osd_op_t::~osd_op_t()
  124. {
  125. if (bs_op)
  126. {
  127. delete bs_op;
  128. }
  129. if (op_data)
  130. {
  131. free(op_data);
  132. }
  133. if (rmw_buf)
  134. {
  135. free(rmw_buf);
  136. }
  137. if (buf)
  138. {
  139. // Note: reusing osd_op_t WILL currently lead to memory leaks
  140. // So we don't reuse it, but free it every time
  141. free(buf);
  142. }
  143. }
  144. bool osd_t::shutdown()
  145. {
  146. stopping = true;
  147. if (inflight_ops > 0)
  148. {
  149. return false;
  150. }
  151. return bs->is_safe_to_stop();
  152. }
  153. void osd_t::loop()
  154. {
  155. if (!wait_state)
  156. {
  157. handle_epoll_events();
  158. wait_state = 1;
  159. }
  160. handle_peers();
  161. read_requests();
  162. send_replies();
  163. ringloop->submit();
  164. }
  165. void osd_t::handle_epoll_events()
  166. {
  167. io_uring_sqe *sqe = ringloop->get_sqe();
  168. if (!sqe)
  169. {
  170. throw std::runtime_error("can't get SQE, will fall out of sync with EPOLLET");
  171. }
  172. ring_data_t *data = ((ring_data_t*)sqe->user_data);
  173. data->allow_cancel = true;
  174. my_uring_prep_poll_add(sqe, epoll_fd_index, POLLIN);
  175. sqe->flags |= IOSQE_FIXED_FILE;
  176. data->callback = [this](ring_data_t *data)
  177. {
  178. if (data->res < 0 && data->res != -ECANCELED)
  179. {
  180. throw std::runtime_error(std::string("epoll failed: ") + strerror(-data->res));
  181. }
  182. handle_epoll_events();
  183. };
  184. ringloop->submit();
  185. // FIXME With SQ thread we have no guarantee that epoll request will be submitted right here...
  186. int nfds;
  187. epoll_event events[MAX_EPOLL_EVENTS];
  188. restart:
  189. nfds = epoll_wait(epoll_fd, events, MAX_EPOLL_EVENTS, 0);
  190. for (int i = 0; i < nfds; i++)
  191. {
  192. if (events[i].data.fd == listen_fd)
  193. {
  194. // Accept new connections
  195. sockaddr_in addr;
  196. socklen_t peer_addr_size = sizeof(addr);
  197. int peer_fd;
  198. while ((peer_fd = accept(listen_fd, (sockaddr*)&addr, &peer_addr_size)) >= 0)
  199. {
  200. char peer_str[256];
  201. printf("osd: new client %d: connection from %s port %d\n", peer_fd, inet_ntop(AF_INET, &addr.sin_addr, peer_str, 256), ntohs(addr.sin_port));
  202. fcntl(peer_fd, F_SETFL, fcntl(listen_fd, F_GETFL, 0) | O_NONBLOCK);
  203. int one = 1;
  204. setsockopt(peer_fd, SOL_TCP, TCP_NODELAY, &one, sizeof(one));
  205. clients[peer_fd] = {
  206. .peer_addr = addr,
  207. .peer_port = ntohs(addr.sin_port),
  208. .peer_fd = peer_fd,
  209. .peer_fd_index = ringloop->register_fd(peer_fd),
  210. .peer_state = PEER_CONNECTED,
  211. };
  212. // Add FD to epoll
  213. epoll_event ev;
  214. ev.data.fd = peer_fd;
  215. ev.events = EPOLLIN | EPOLLRDHUP | EPOLLET;
  216. if (epoll_ctl(epoll_fd, EPOLL_CTL_ADD, peer_fd, &ev) < 0)
  217. {
  218. throw std::runtime_error(std::string("epoll_ctl: ") + strerror(errno));
  219. }
  220. // Try to accept next connection
  221. peer_addr_size = sizeof(addr);
  222. }
  223. if (peer_fd == -1 && errno != EAGAIN)
  224. {
  225. throw std::runtime_error(std::string("accept: ") + strerror(errno));
  226. }
  227. }
  228. else
  229. {
  230. auto & cl = clients[events[i].data.fd];
  231. if (cl.peer_state == PEER_CONNECTING)
  232. {
  233. // Either OUT (connected) or HUP
  234. handle_connect_result(cl.peer_fd);
  235. }
  236. else if (events[i].events & EPOLLRDHUP)
  237. {
  238. // Stop client
  239. printf("osd: client %d disconnected\n", cl.peer_fd);
  240. stop_client(cl.peer_fd);
  241. }
  242. else
  243. {
  244. // Mark client as ready (i.e. some data is available)
  245. cl.read_ready++;
  246. if (cl.read_ready == 1)
  247. {
  248. read_ready_clients.push_back(cl.peer_fd);
  249. ringloop->wakeup();
  250. }
  251. }
  252. }
  253. }
  254. if (nfds > 0)
  255. {
  256. goto restart;
  257. }
  258. }
  259. void osd_t::cancel_osd_ops(osd_client_t & cl)
  260. {
  261. for (auto p: cl.sent_ops)
  262. {
  263. cancel_op(p.second);
  264. }
  265. cl.sent_ops.clear();
  266. for (auto op: cl.outbox)
  267. {
  268. cancel_op(op);
  269. }
  270. cl.outbox.clear();
  271. if (cl.write_op)
  272. {
  273. cancel_op(cl.write_op);
  274. cl.write_op = NULL;
  275. }
  276. }
  277. void osd_t::cancel_op(osd_op_t *op)
  278. {
  279. if (op->op_type == OSD_OP_OUT)
  280. {
  281. op->reply.hdr.magic = SECONDARY_OSD_REPLY_MAGIC;
  282. op->reply.hdr.id = op->req.hdr.id;
  283. op->reply.hdr.opcode = op->req.hdr.opcode;
  284. op->reply.hdr.retval = -EPIPE;
  285. op->callback(op);
  286. }
  287. else
  288. {
  289. delete op;
  290. }
  291. }
  292. void osd_t::stop_client(int peer_fd)
  293. {
  294. auto it = clients.find(peer_fd);
  295. if (it == clients.end())
  296. {
  297. return;
  298. }
  299. auto & cl = it->second;
  300. if (epoll_ctl(epoll_fd, EPOLL_CTL_DEL, peer_fd, NULL) < 0)
  301. {
  302. throw std::runtime_error(std::string("epoll_ctl: ") + strerror(errno));
  303. }
  304. if (cl.osd_num)
  305. {
  306. // Cancel outbound operations
  307. cancel_osd_ops(cl);
  308. osd_peer_fds.erase(cl.osd_num);
  309. repeer_pgs(cl.osd_num, false);
  310. peering_state |= OSD_PEERING_PEERS;
  311. }
  312. if (cl.read_op)
  313. {
  314. delete cl.read_op;
  315. }
  316. for (auto rit = read_ready_clients.begin(); rit != read_ready_clients.end(); rit++)
  317. {
  318. if (*rit == peer_fd)
  319. {
  320. read_ready_clients.erase(rit);
  321. break;
  322. }
  323. }
  324. for (auto wit = write_ready_clients.begin(); wit != write_ready_clients.end(); wit++)
  325. {
  326. if (*wit == peer_fd)
  327. {
  328. write_ready_clients.erase(wit);
  329. break;
  330. }
  331. }
  332. clients.erase(it);
  333. close(peer_fd);
  334. }
  335. void osd_t::exec_op(osd_op_t *cur_op)
  336. {
  337. clock_gettime(CLOCK_REALTIME, &cur_op->tv_begin);
  338. if (stopping)
  339. {
  340. // Throw operation away
  341. delete cur_op;
  342. return;
  343. }
  344. cur_op->send_list.push_back(cur_op->reply.buf, OSD_PACKET_SIZE);
  345. if (cur_op->req.hdr.magic != SECONDARY_OSD_OP_MAGIC ||
  346. cur_op->req.hdr.opcode < OSD_OP_MIN || cur_op->req.hdr.opcode > OSD_OP_MAX ||
  347. (cur_op->req.hdr.opcode == OSD_OP_SECONDARY_READ || cur_op->req.hdr.opcode == OSD_OP_SECONDARY_WRITE) &&
  348. (cur_op->req.sec_rw.len > OSD_RW_MAX || cur_op->req.sec_rw.len % OSD_RW_ALIGN || cur_op->req.sec_rw.offset % OSD_RW_ALIGN) ||
  349. (cur_op->req.hdr.opcode == OSD_OP_READ || cur_op->req.hdr.opcode == OSD_OP_WRITE) &&
  350. (cur_op->req.rw.len > OSD_RW_MAX || cur_op->req.rw.len % OSD_RW_ALIGN || cur_op->req.rw.offset % OSD_RW_ALIGN))
  351. {
  352. // Bad command
  353. cur_op->reply.hdr.magic = SECONDARY_OSD_REPLY_MAGIC;
  354. cur_op->reply.hdr.id = cur_op->req.hdr.id;
  355. cur_op->reply.hdr.opcode = cur_op->req.hdr.opcode;
  356. cur_op->reply.hdr.retval = -EINVAL;
  357. outbox_push(this->clients[cur_op->peer_fd], cur_op);
  358. return;
  359. }
  360. inflight_ops++;
  361. if (cur_op->req.hdr.opcode == OSD_OP_TEST_SYNC_STAB_ALL)
  362. {
  363. exec_sync_stab_all(cur_op);
  364. }
  365. else if (cur_op->req.hdr.opcode == OSD_OP_SHOW_CONFIG)
  366. {
  367. exec_show_config(cur_op);
  368. }
  369. else if (cur_op->req.hdr.opcode == OSD_OP_READ)
  370. {
  371. continue_primary_read(cur_op);
  372. }
  373. else if (cur_op->req.hdr.opcode == OSD_OP_WRITE)
  374. {
  375. continue_primary_write(cur_op);
  376. }
  377. else if (cur_op->req.hdr.opcode == OSD_OP_SYNC)
  378. {
  379. continue_primary_sync(cur_op);
  380. }
  381. else
  382. {
  383. exec_secondary(cur_op);
  384. }
  385. }