Simplified distributed block storage with strong consistency, like in Ceph
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 
 
 
 

208 lines
6.6 KiB

  1. #include "osd.h"
  2. bool osd_t::try_receive(osd_client_t & cl)
  3. {
  4. int peer_fd = cl.peer_fd;
  5. io_uring_sqe* sqe = ringloop->get_sqe();
  6. if (!sqe)
  7. {
  8. return false;
  9. }
  10. ring_data_t* data = ((ring_data_t*)sqe->user_data);
  11. if (!cl.read_buf)
  12. {
  13. // no reads in progress
  14. // so this is either a new command or a reply to a previously sent command
  15. if (!cl.read_op)
  16. {
  17. cl.read_op = new osd_op_t;
  18. cl.read_op->peer_fd = peer_fd;
  19. }
  20. cl.read_op->op_type = OSD_OP_IN;
  21. cl.read_buf = &cl.read_op->req.buf;
  22. cl.read_remaining = OSD_PACKET_SIZE;
  23. cl.read_state = CL_READ_OP;
  24. }
  25. cl.read_iov.iov_base = cl.read_buf;
  26. cl.read_iov.iov_len = cl.read_remaining;
  27. cl.read_msg.msg_iov = &cl.read_iov;
  28. cl.read_msg.msg_iovlen = 1;
  29. data->callback = [this, peer_fd](ring_data_t *data) { handle_read(data, peer_fd); };
  30. my_uring_prep_recvmsg(sqe, peer_fd, &cl.read_msg, 0);
  31. return true;
  32. }
  33. void osd_t::read_requests()
  34. {
  35. for (auto & p: clients)
  36. {
  37. if (p.second.peer_state == PEER_CONNECTED && p.second.read_iov.iov_len == 0)
  38. {
  39. try_receive(p.second);
  40. }
  41. }
  42. }
  43. void osd_t::handle_read(ring_data_t *data, int peer_fd)
  44. {
  45. auto cl_it = clients.find(peer_fd);
  46. if (cl_it != clients.end())
  47. {
  48. auto & cl = cl_it->second;
  49. cl.read_iov.iov_len = 0;
  50. if (data->res == -EAGAIN)
  51. {
  52. return;
  53. }
  54. else if (data->res < 0)
  55. {
  56. // this is a client socket, so don't panic. just disconnect it
  57. printf("Client %d socket read error: %d (%s). Disconnecting client\n", peer_fd, -data->res, strerror(-data->res));
  58. stop_client(peer_fd);
  59. return;
  60. }
  61. if (data->res > 0)
  62. {
  63. cl.read_remaining -= data->res;
  64. cl.read_buf += data->res;
  65. if (cl.read_remaining <= 0)
  66. {
  67. cl.read_buf = NULL;
  68. if (cl.read_state == CL_READ_OP)
  69. {
  70. if (cl.read_op->req.hdr.magic == SECONDARY_OSD_REPLY_MAGIC)
  71. {
  72. handle_reply_hdr(&cl);
  73. }
  74. else
  75. {
  76. handle_op_hdr(&cl);
  77. }
  78. }
  79. else if (cl.read_state == CL_READ_DATA)
  80. {
  81. // Operation is ready
  82. exec_op(cl.read_op);
  83. cl.read_op = NULL;
  84. cl.read_state = 0;
  85. }
  86. else if (cl.read_state == CL_READ_REPLY_DATA)
  87. {
  88. // Reply is ready
  89. auto req_it = cl.sent_ops.find(cl.read_reply_id);
  90. osd_op_t *request = req_it->second;
  91. cl.sent_ops.erase(req_it);
  92. cl.read_reply_id = 0;
  93. cl.read_state = 0;
  94. // Measure subop latency
  95. timeval tv_end;
  96. gettimeofday(&tv_end, NULL);
  97. subop_stat_count[request->req.hdr.opcode]++;
  98. subop_stat_sum[request->req.hdr.opcode] += (
  99. (tv_end.tv_sec - request->tv_begin.tv_sec)*1000000 +
  100. tv_end.tv_usec - request->tv_begin.tv_usec
  101. );
  102. request->callback(request);
  103. }
  104. }
  105. }
  106. }
  107. }
  108. void osd_t::handle_op_hdr(osd_client_t *cl)
  109. {
  110. osd_op_t *cur_op = cl->read_op;
  111. if (cur_op->req.hdr.opcode == OSD_OP_SECONDARY_READ)
  112. {
  113. if (cur_op->req.sec_rw.len > 0)
  114. cur_op->buf = memalign(512, cur_op->req.sec_rw.len);
  115. cl->read_remaining = 0;
  116. }
  117. else if (cur_op->req.hdr.opcode == OSD_OP_SECONDARY_WRITE)
  118. {
  119. if (cur_op->req.sec_rw.len > 0)
  120. cur_op->buf = memalign(512, cur_op->req.sec_rw.len);
  121. cl->read_remaining = cur_op->req.sec_rw.len;
  122. }
  123. else if (cur_op->req.hdr.opcode == OSD_OP_SECONDARY_STABILIZE ||
  124. cur_op->req.hdr.opcode == OSD_OP_SECONDARY_ROLLBACK)
  125. {
  126. if (cur_op->req.sec_stab.len > 0)
  127. cur_op->buf = memalign(512, cur_op->req.sec_stab.len);
  128. cl->read_remaining = cur_op->req.sec_stab.len;
  129. }
  130. else if (cur_op->req.hdr.opcode == OSD_OP_READ)
  131. {
  132. if (cur_op->req.rw.len > 0)
  133. cur_op->buf = memalign(512, cur_op->req.rw.len);
  134. cl->read_remaining = 0;
  135. }
  136. else if (cur_op->req.hdr.opcode == OSD_OP_WRITE)
  137. {
  138. if (cur_op->req.rw.len > 0)
  139. cur_op->buf = memalign(512, cur_op->req.rw.len);
  140. cl->read_remaining = cur_op->req.rw.len;
  141. }
  142. if (cl->read_remaining > 0)
  143. {
  144. // Read data
  145. cl->read_buf = cur_op->buf;
  146. cl->read_state = CL_READ_DATA;
  147. }
  148. else
  149. {
  150. // Operation is ready
  151. cl->read_op = NULL;
  152. cl->read_state = 0;
  153. exec_op(cur_op);
  154. }
  155. }
  156. void osd_t::handle_reply_hdr(osd_client_t *cl)
  157. {
  158. osd_op_t *cur_op = cl->read_op;
  159. auto req_it = cl->sent_ops.find(cur_op->req.hdr.id);
  160. if (req_it == cl->sent_ops.end())
  161. {
  162. // Command out of sync. Drop connection
  163. printf("Client %d command out of sync: id %lu\n", cl->peer_fd, cur_op->req.hdr.id);
  164. stop_client(cl->peer_fd);
  165. return;
  166. }
  167. osd_op_t *op = req_it->second;
  168. memcpy(op->reply.buf, cur_op->req.buf, OSD_PACKET_SIZE);
  169. if (op->reply.hdr.opcode == OSD_OP_SECONDARY_READ &&
  170. op->reply.hdr.retval > 0)
  171. {
  172. // Read data. In this case we assume that the buffer is preallocated by the caller (!)
  173. assert(op->buf);
  174. cl->read_state = CL_READ_REPLY_DATA;
  175. cl->read_reply_id = op->req.hdr.id;
  176. cl->read_buf = op->buf;
  177. cl->read_remaining = op->reply.hdr.retval;
  178. }
  179. else if (op->reply.hdr.opcode == OSD_OP_SECONDARY_LIST &&
  180. op->reply.hdr.retval > 0)
  181. {
  182. op->buf = memalign(512, sizeof(obj_ver_id) * op->reply.hdr.retval);
  183. cl->read_state = CL_READ_REPLY_DATA;
  184. cl->read_reply_id = op->req.hdr.id;
  185. cl->read_buf = op->buf;
  186. cl->read_remaining = sizeof(obj_ver_id) * op->reply.hdr.retval;
  187. }
  188. else
  189. {
  190. cl->read_state = 0;
  191. cl->sent_ops.erase(req_it);
  192. // Measure subop latency
  193. timeval tv_end;
  194. gettimeofday(&tv_end, NULL);
  195. subop_stat_count[op->req.hdr.opcode]++;
  196. subop_stat_sum[op->req.hdr.opcode] += (
  197. (tv_end.tv_sec - op->tv_begin.tv_sec)*1000000 +
  198. tv_end.tv_usec - op->tv_begin.tv_usec
  199. );
  200. op->callback(op);
  201. }
  202. }