Simplified distributed block storage with strong consistency, like in Ceph
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

407 lines
13 KiB

  1. #include <netinet/tcp.h>
  2. #include <sys/epoll.h>
  3. #include <algorithm>
  4. #include "osd.h"
  5. void osd_t::init_primary()
  6. {
  7. // Initial test version of clustering code requires exactly 2 peers
  8. // FIXME Hardcode
  9. std::string peerstr = config["peers"];
  10. while (peerstr.size())
  11. {
  12. int pos = peerstr.find(',');
  13. peers.push_back(parse_peer(pos < 0 ? peerstr : peerstr.substr(0, pos)));
  14. peerstr = pos < 0 ? std::string("") : peerstr.substr(pos+1);
  15. for (int i = 0; i < peers.size()-1; i++)
  16. if (peers[i].osd_num == peers[peers.size()-1].osd_num)
  17. throw std::runtime_error("same osd number "+std::to_string(peers[i].osd_num)+" specified twice in peers");
  18. }
  19. if (peers.size() < 2)
  20. throw std::runtime_error("run_primary requires at least 2 peers");
  21. pgs.push_back((pg_t){
  22. .state = PG_OFFLINE,
  23. .pg_cursize = 0,
  24. .pg_num = 1,
  25. .target_set = { 1, 2, 3 },
  26. .cur_set = { 1, 0, 0 },
  27. });
  28. pg_count = 1;
  29. peering_state = OSD_PEERING_PEERS;
  30. }
  31. osd_peer_def_t osd_t::parse_peer(std::string peer)
  32. {
  33. // OSD_NUM:IP:PORT
  34. int pos1 = peer.find(':');
  35. int pos2 = peer.find(':', pos1+1);
  36. if (pos1 < 0 || pos2 < 0)
  37. throw new std::runtime_error("OSD peer string must be in the form OSD_NUM:IP:PORT");
  38. osd_peer_def_t r;
  39. r.addr = peer.substr(pos1+1, pos2-pos1-1);
  40. std::string osd_num_str = peer.substr(0, pos1);
  41. std::string port_str = peer.substr(pos2+1);
  42. r.osd_num = strtoull(osd_num_str.c_str(), NULL, 10);
  43. if (!r.osd_num)
  44. throw new std::runtime_error("Could not parse OSD peer osd_num");
  45. r.port = strtoull(port_str.c_str(), NULL, 10);
  46. if (!r.port)
  47. throw new std::runtime_error("Could not parse OSD peer port");
  48. return r;
  49. }
  50. void osd_t::connect_peer(osd_num_t osd_num, const char *peer_host, int peer_port, std::function<void(osd_num_t, int)> callback)
  51. {
  52. struct sockaddr_in addr;
  53. int r;
  54. if ((r = inet_pton(AF_INET, peer_host, &addr.sin_addr)) != 1)
  55. {
  56. callback(osd_num, -EINVAL);
  57. return;
  58. }
  59. addr.sin_family = AF_INET;
  60. addr.sin_port = htons(peer_port ? peer_port : 11203);
  61. int peer_fd = socket(AF_INET, SOCK_STREAM, 0);
  62. if (peer_fd < 0)
  63. {
  64. callback(osd_num, -errno);
  65. return;
  66. }
  67. fcntl(peer_fd, F_SETFL, fcntl(peer_fd, F_GETFL, 0) | O_NONBLOCK);
  68. r = connect(peer_fd, (sockaddr*)&addr, sizeof(addr));
  69. if (r < 0 && errno != EINPROGRESS)
  70. {
  71. close(peer_fd);
  72. callback(osd_num, -errno);
  73. return;
  74. }
  75. clients[peer_fd] = (osd_client_t){
  76. .peer_addr = addr,
  77. .peer_port = peer_port,
  78. .peer_fd = peer_fd,
  79. .peer_state = PEER_CONNECTING,
  80. .connect_callback = callback,
  81. .osd_num = osd_num,
  82. };
  83. osd_peer_fds[osd_num] = peer_fd;
  84. // Add FD to epoll (EPOLLOUT for tracking connect() result)
  85. epoll_event ev;
  86. ev.data.fd = peer_fd;
  87. ev.events = EPOLLOUT | EPOLLIN | EPOLLRDHUP | EPOLLET;
  88. if (epoll_ctl(epoll_fd, EPOLL_CTL_ADD, peer_fd, &ev) < 0)
  89. {
  90. throw std::runtime_error(std::string("epoll_ctl: ") + strerror(errno));
  91. }
  92. }
  93. void osd_t::handle_connect_result(int peer_fd)
  94. {
  95. auto & cl = clients[peer_fd];
  96. osd_num_t osd_num = cl.osd_num;
  97. auto callback = cl.connect_callback;
  98. int result = 0;
  99. socklen_t result_len = sizeof(result);
  100. if (getsockopt(peer_fd, SOL_SOCKET, SO_ERROR, &result, &result_len) < 0)
  101. {
  102. result = errno;
  103. }
  104. if (result != 0)
  105. {
  106. stop_client(peer_fd);
  107. callback(osd_num, -result);
  108. return;
  109. }
  110. int one = 1;
  111. setsockopt(peer_fd, SOL_TCP, TCP_NODELAY, &one, sizeof(one));
  112. // Disable EPOLLOUT on this fd
  113. cl.peer_fd_index = ringloop->register_fd(peer_fd);
  114. cl.connect_callback = NULL;
  115. cl.peer_state = PEER_CONNECTED;
  116. epoll_event ev;
  117. ev.data.fd = peer_fd;
  118. ev.events = EPOLLIN | EPOLLRDHUP | EPOLLET;
  119. if (epoll_ctl(epoll_fd, EPOLL_CTL_MOD, peer_fd, &ev) < 0)
  120. {
  121. throw std::runtime_error(std::string("epoll_ctl: ") + strerror(errno));
  122. }
  123. callback(osd_num, peer_fd);
  124. }
  125. // Peering loop
  126. void osd_t::handle_peers()
  127. {
  128. if (peering_state & OSD_PEERING_PEERS)
  129. {
  130. for (int i = 0; i < peers.size(); i++)
  131. {
  132. if (osd_peer_fds.find(peers[i].osd_num) == osd_peer_fds.end() &&
  133. time(NULL) - peers[i].last_connect_attempt > 5) // FIXME hardcode 5
  134. {
  135. peers[i].last_connect_attempt = time(NULL);
  136. connect_peer(peers[i].osd_num, peers[i].addr.c_str(), peers[i].port, [this](osd_num_t osd_num, int peer_fd)
  137. {
  138. // FIXME: Check peer config after connecting
  139. if (peer_fd < 0)
  140. {
  141. printf("Failed to connect to peer OSD %lu: %s\n", osd_num, strerror(-peer_fd));
  142. return;
  143. }
  144. printf("Connected with peer OSD %lu (fd %d)\n", clients[peer_fd].osd_num, peer_fd);
  145. int i;
  146. for (i = 0; i < peers.size(); i++)
  147. {
  148. if (osd_peer_fds.find(peers[i].osd_num) == osd_peer_fds.end())
  149. break;
  150. }
  151. if (i >= peers.size())
  152. {
  153. // Connected to all peers
  154. peering_state = peering_state & ~OSD_PEERING_PEERS;
  155. }
  156. repeer_pgs(osd_num, true);
  157. });
  158. }
  159. }
  160. }
  161. if (peering_state & OSD_PEERING_PGS)
  162. {
  163. bool still_doing_pgs = false;
  164. for (int i = 0; i < pgs.size(); i++)
  165. {
  166. if (pgs[i].state == PG_PEERING)
  167. {
  168. if (!pgs[i].peering_state->list_ops.size())
  169. {
  170. pgs[i].calc_object_states();
  171. }
  172. else
  173. {
  174. still_doing_pgs = true;
  175. }
  176. }
  177. }
  178. if (!still_doing_pgs)
  179. {
  180. // Done all PGs
  181. peering_state = peering_state & ~OSD_PEERING_PGS;
  182. }
  183. }
  184. }
  185. void osd_t::repeer_pgs(osd_num_t osd_num, bool is_connected)
  186. {
  187. // Re-peer affected PGs
  188. // FIXME: We shouldn't rely just on target_set. Other OSDs may also contain PG data.
  189. osd_num_t real_osd = (is_connected ? osd_num : 0);
  190. for (int i = 0; i < pgs.size(); i++)
  191. {
  192. bool repeer = false;
  193. for (int r = 0; r < pgs[i].target_set.size(); r++)
  194. {
  195. if (pgs[i].target_set[r] == osd_num &&
  196. pgs[i].cur_set[r] != real_osd)
  197. {
  198. pgs[i].cur_set[r] = real_osd;
  199. repeer = true;
  200. break;
  201. }
  202. }
  203. if (repeer)
  204. {
  205. // Repeer this pg
  206. printf("Repeer PG %d because of OSD %lu\n", i, osd_num);
  207. start_pg_peering(i);
  208. peering_state |= OSD_PEERING_PGS;
  209. }
  210. }
  211. }
  212. // Repeer on each connect/disconnect peer event
  213. void osd_t::start_pg_peering(int pg_idx)
  214. {
  215. auto & pg = pgs[pg_idx];
  216. pg.state = PG_PEERING;
  217. pg.state_dict.clear();
  218. pg.obj_states.clear();
  219. pg.ver_override.clear();
  220. pg.pg_cursize = 0;
  221. for (int role = 0; role < pg.cur_set.size(); role++)
  222. {
  223. if (pg.cur_set[role] != 0)
  224. {
  225. pg.pg_cursize++;
  226. }
  227. }
  228. if (pg.pg_cursize < pg.pg_minsize)
  229. {
  230. pg.state = PG_INCOMPLETE;
  231. }
  232. if (pg.peering_state)
  233. {
  234. // Adjust the peering operation that's still in progress
  235. for (auto it = pg.peering_state->list_ops.begin(); it != pg.peering_state->list_ops.end(); it++)
  236. {
  237. int role;
  238. for (role = 0; role < pg.cur_set.size(); role++)
  239. {
  240. if (pg.cur_set[role] == it->first)
  241. break;
  242. }
  243. if (pg.state == PG_INCOMPLETE || role >= pg.cur_set.size())
  244. {
  245. // Discard the result after completion, which, chances are, will be unsuccessful
  246. auto list_op = it->second;
  247. if (list_op->peer_fd == 0)
  248. {
  249. // Self
  250. list_op->bs_op->callback = [list_op](blockstore_op_t *bs_op)
  251. {
  252. if (list_op->bs_op->buf)
  253. free(list_op->bs_op->buf);
  254. delete list_op;
  255. };
  256. }
  257. else
  258. {
  259. // Peer
  260. list_op->callback = [](osd_op_t *list_op)
  261. {
  262. delete list_op;
  263. };
  264. }
  265. pg.peering_state->list_ops.erase(it);
  266. it = pg.peering_state->list_ops.begin();
  267. }
  268. }
  269. for (auto it = pg.peering_state->list_results.begin(); it != pg.peering_state->list_results.end(); it++)
  270. {
  271. int role;
  272. for (role = 0; role < pg.cur_set.size(); role++)
  273. {
  274. if (pg.cur_set[role] == it->first)
  275. break;
  276. }
  277. if (pg.state == PG_INCOMPLETE || role >= pg.cur_set.size())
  278. {
  279. if (it->second.buf)
  280. {
  281. free(it->second.buf);
  282. }
  283. pg.peering_state->list_results.erase(it);
  284. it = pg.peering_state->list_results.begin();
  285. }
  286. }
  287. }
  288. if (pg.state == PG_INCOMPLETE)
  289. {
  290. if (pg.peering_state)
  291. {
  292. delete pg.peering_state;
  293. pg.peering_state = NULL;
  294. }
  295. printf("PG %d is incomplete\n", pg.pg_num);
  296. return;
  297. }
  298. if (!pg.peering_state)
  299. {
  300. pg.peering_state = new pg_peering_state_t();
  301. }
  302. auto ps = pg.peering_state;
  303. for (int role = 0; role < pg.cur_set.size(); role++)
  304. {
  305. osd_num_t role_osd = pg.cur_set[role];
  306. if (!role_osd)
  307. {
  308. continue;
  309. }
  310. if (ps->list_ops.find(role_osd) != ps->list_ops.end() ||
  311. ps->list_results.find(role_osd) != ps->list_results.end())
  312. {
  313. continue;
  314. }
  315. if (role_osd == this->osd_num)
  316. {
  317. // Self
  318. osd_op_t *op = new osd_op_t();
  319. op->op_type = 0;
  320. op->peer_fd = 0;
  321. op->bs_op = new blockstore_op_t();
  322. op->bs_op->opcode = BS_OP_LIST;
  323. op->bs_op->oid.stripe = parity_block_size;
  324. op->bs_op->len = pg_count,
  325. op->bs_op->offset = pg.pg_num-1,
  326. op->bs_op->callback = [ps, op, role_osd](blockstore_op_t *bs_op)
  327. {
  328. if (op->bs_op->retval < 0)
  329. {
  330. throw std::runtime_error("local OP_LIST failed");
  331. }
  332. printf(
  333. "Got object list from OSD %lu (local): %d object versions (%lu of them stable)\n",
  334. role_osd, bs_op->retval, bs_op->version
  335. );
  336. ps->list_results[role_osd] = {
  337. .buf = (obj_ver_id*)op->bs_op->buf,
  338. .total_count = (uint64_t)op->bs_op->retval,
  339. .stable_count = op->bs_op->version,
  340. };
  341. ps->list_done++;
  342. ps->list_ops.erase(role_osd);
  343. delete op;
  344. };
  345. bs->enqueue_op(op->bs_op);
  346. ps->list_ops[role_osd] = op;
  347. }
  348. else
  349. {
  350. // Peer
  351. auto & cl = clients[osd_peer_fds[role_osd]];
  352. osd_op_t *op = new osd_op_t();
  353. op->op_type = OSD_OP_OUT;
  354. op->send_list.push_back(op->req.buf, OSD_PACKET_SIZE);
  355. op->peer_fd = cl.peer_fd;
  356. op->req = {
  357. .sec_list = {
  358. .header = {
  359. .magic = SECONDARY_OSD_OP_MAGIC,
  360. .id = this->next_subop_id++,
  361. .opcode = OSD_OP_SECONDARY_LIST,
  362. },
  363. .list_pg = pg.pg_num,
  364. .pg_count = pg_count,
  365. .parity_block_size = parity_block_size,
  366. },
  367. };
  368. op->callback = [this, ps, role_osd](osd_op_t *op)
  369. {
  370. if (op->reply.hdr.retval < 0)
  371. {
  372. printf("Failed to get object list from OSD %lu (retval=%ld), disconnecting peer\n", role_osd, op->reply.hdr.retval);
  373. ps->list_ops.erase(role_osd);
  374. stop_client(op->peer_fd);
  375. delete op;
  376. return;
  377. }
  378. printf(
  379. "Got object list from OSD %lu: %ld object versions (%lu of them stable)\n",
  380. role_osd, op->reply.hdr.retval, op->reply.sec_list.stable_count
  381. );
  382. ps->list_results[role_osd] = {
  383. .buf = (obj_ver_id*)op->buf,
  384. .total_count = (uint64_t)op->reply.hdr.retval,
  385. .stable_count = op->reply.sec_list.stable_count,
  386. };
  387. // set op->buf to NULL so it doesn't get freed
  388. op->buf = NULL;
  389. ps->list_done++;
  390. ps->list_ops.erase(role_osd);
  391. delete op;
  392. };
  393. outbox_push(cl, op);
  394. ps->list_ops[role_osd] = op;
  395. }
  396. }
  397. ringloop->wakeup();
  398. }