Simplified distributed block storage with strong consistency, like in Ceph
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

145 lines
4.8 KiB

1 year ago
1 year ago
  1. #include "cluster_client.h"
  2. void cluster_client_t::outbox_push(osd_op_t *cur_op)
  3. {
  4. assert(cur_op->peer_fd);
  5. auto & cl = clients.at(cur_op->peer_fd);
  6. if (cur_op->op_type == OSD_OP_OUT)
  7. {
  8. clock_gettime(CLOCK_REALTIME, &cur_op->tv_begin);
  9. }
  10. cl.outbox.push_back(cur_op);
  11. if (cl.write_op || cl.outbox.size() > 1 || !try_send(cl))
  12. {
  13. if (cl.write_state == 0)
  14. {
  15. cl.write_state = CL_WRITE_READY;
  16. write_ready_clients.push_back(cur_op->peer_fd);
  17. }
  18. ringloop->wakeup();
  19. }
  20. else
  21. ringloop->submit();
  22. }
  23. bool cluster_client_t::try_send(osd_client_t & cl)
  24. {
  25. int peer_fd = cl.peer_fd;
  26. {
  27. timespec now;
  28. clock_gettime(CLOCK_REALTIME, &now);
  29. printf("get_sqe %s %d %ld.%06ld\n", __FILE__, __LINE__, now.tv_sec, now.tv_nsec/1000);
  30. }
  31. io_uring_sqe* sqe = ringloop->get_sqe();
  32. if (!sqe)
  33. {
  34. return false;
  35. }
  36. ring_data_t* data = ((ring_data_t*)sqe->user_data);
  37. if (!cl.write_op)
  38. {
  39. // pick next command
  40. cl.write_op = cl.outbox.front();
  41. cl.outbox.pop_front();
  42. cl.write_state = CL_WRITE_REPLY;
  43. if (cl.write_op->op_type == OSD_OP_IN)
  44. {
  45. // Measure execution latency
  46. timespec tv_end;
  47. clock_gettime(CLOCK_REALTIME, &tv_end);
  48. stats.op_stat_count[cl.write_op->req.hdr.opcode]++;
  49. if (!stats.op_stat_count[cl.write_op->req.hdr.opcode])
  50. {
  51. stats.op_stat_count[cl.write_op->req.hdr.opcode]++;
  52. stats.op_stat_sum[cl.write_op->req.hdr.opcode] = 0;
  53. stats.op_stat_bytes[cl.write_op->req.hdr.opcode] = 0;
  54. }
  55. stats.op_stat_sum[cl.write_op->req.hdr.opcode] += (
  56. (tv_end.tv_sec - cl.write_op->tv_begin.tv_sec)*1000000 +
  57. (tv_end.tv_nsec - cl.write_op->tv_begin.tv_nsec)/1000
  58. );
  59. if (cl.write_op->req.hdr.opcode == OSD_OP_READ ||
  60. cl.write_op->req.hdr.opcode == OSD_OP_WRITE)
  61. {
  62. stats.op_stat_bytes[cl.write_op->req.hdr.opcode] += cl.write_op->req.rw.len;
  63. }
  64. else if (cl.write_op->req.hdr.opcode == OSD_OP_SECONDARY_READ ||
  65. cl.write_op->req.hdr.opcode == OSD_OP_SECONDARY_WRITE)
  66. {
  67. stats.op_stat_bytes[cl.write_op->req.hdr.opcode] += cl.write_op->req.sec_rw.len;
  68. }
  69. }
  70. }
  71. cl.write_msg.msg_iov = cl.write_op->send_list.get_iovec();
  72. cl.write_msg.msg_iovlen = cl.write_op->send_list.get_size();
  73. data->callback = [this, peer_fd](ring_data_t *data) { handle_send(data, peer_fd); };
  74. my_uring_prep_sendmsg(sqe, peer_fd, &cl.write_msg, 0);
  75. return true;
  76. }
  77. void cluster_client_t::send_replies()
  78. {
  79. for (int i = 0; i < write_ready_clients.size(); i++)
  80. {
  81. int peer_fd = write_ready_clients[i];
  82. if (!try_send(clients[peer_fd]))
  83. {
  84. write_ready_clients.erase(write_ready_clients.begin(), write_ready_clients.begin() + i);
  85. return;
  86. }
  87. }
  88. write_ready_clients.clear();
  89. }
  90. void cluster_client_t::handle_send(ring_data_t *data, int peer_fd)
  91. {
  92. auto cl_it = clients.find(peer_fd);
  93. if (cl_it != clients.end())
  94. {
  95. auto & cl = cl_it->second;
  96. if (data->res < 0 && data->res != -EAGAIN)
  97. {
  98. // this is a client socket, so don't panic. just disconnect it
  99. printf("Client %d socket write error: %d (%s). Disconnecting client\n", peer_fd, -data->res, strerror(-data->res));
  100. stop_client(peer_fd);
  101. return;
  102. }
  103. if (data->res >= 0)
  104. {
  105. osd_op_t *cur_op = cl.write_op;
  106. while (data->res > 0 && cur_op->send_list.sent < cur_op->send_list.count)
  107. {
  108. iovec & iov = cur_op->send_list.buf[cur_op->send_list.sent];
  109. if (iov.iov_len <= data->res)
  110. {
  111. data->res -= iov.iov_len;
  112. cur_op->send_list.sent++;
  113. }
  114. else
  115. {
  116. iov.iov_len -= data->res;
  117. iov.iov_base += data->res;
  118. break;
  119. }
  120. }
  121. if (cur_op->send_list.sent >= cur_op->send_list.count)
  122. {
  123. // Done
  124. if (cur_op->op_type == OSD_OP_IN)
  125. {
  126. delete cur_op;
  127. }
  128. else
  129. {
  130. cl.sent_ops[cl.write_op->req.hdr.id] = cl.write_op;
  131. }
  132. cl.write_op = NULL;
  133. cl.write_state = cl.outbox.size() > 0 ? CL_WRITE_READY : 0;
  134. }
  135. }
  136. if (cl.write_state != 0)
  137. {
  138. write_ready_clients.push_back(peer_fd);
  139. }
  140. }
  141. }