Simplified distributed block storage with strong consistency, like in Ceph
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 
 
 
 

275 lines
6.9 KiB

  1. #pragma once
  #include <sys/types.h>
  #include <sys/time.h>
  #include <sys/ioctl.h>
  #include <sys/stat.h>
  #include <sys/uio.h>
  #include <fcntl.h>
  #include <unistd.h>
  #include <malloc.h>
  #include <stdlib.h>
  #include <string.h>
  #include <arpa/inet.h>

  #include <new>
  #include <set>
  #include <deque>

  #include "blockstore.h"
  #include "ringloop.h"
  #include "timerfd_interval.h"
  #include "osd_ops.h"
  #include "osd_peering_pg.h"
  #include "sparsepp/sparsepp/spp.h"
  // Operation direction; OSD_OP_IN is the default value of osd_op_t::op_type.
  #define OSD_OP_IN                  0
  #define OSD_OP_OUT                 1

  // Receive-side states.
  // NOTE(review): presumably the values of osd_client_t::read_state -- confirm.
  #define CL_READ_OP                 1
  #define CL_READ_DATA               2
  #define CL_READ_REPLY_DATA         3

  // Send-side states.
  // NOTE(review): presumably the values of osd_client_t::write_state -- confirm.
  #define CL_WRITE_READY             1
  #define CL_WRITE_REPLY             2

  // NOTE(review): presumably the event batch size for one epoll_wait() call -- confirm.
  #define MAX_EPOLL_EVENTS           64

  // Number of iovec entries stored inline in osd_op_buf_list_t before it
  // switches to a heap-allocated array.
  #define OSD_OP_INLINE_BUF_COUNT    16

  // Connection states.
  // NOTE(review): presumably the values of osd_client_t::peer_state -- confirm.
  #define PEER_CONNECTING            1
  #define PEER_CONNECTED             2

  // NOTE(review): presumably values/flags for osd_t::peering_state -- confirm.
  #define OSD_PEERING_PEERS          1
  #define OSD_PEERING_PGS            2

  //#define OSD_STUB
  // Growable list of iovec entries (scatter/gather buffers).
  //
  // The first OSD_OP_INLINE_BUF_COUNT entries are kept in `inline_buf`, so
  // short lists need no heap allocation. Once the inline storage is full the
  // entries move to a malloc()ed array that grows in steps of 16 entries.
  //
  // `count` is the number of entries pushed, `alloc` the current capacity and
  // `sent` the number of leading entries to skip: get_iovec() / get_size()
  // expose only the entries in [sent, count).
  //
  // NOTE(review): the struct owns `buf` but keeps the implicit copy
  // operations. A copy would share (and double-free) the heap array, or point
  // at the source object's inline_buf. Do not copy instances.
  struct osd_op_buf_list_t
  {
      int count = 0, alloc = 0, sent = 0;
      iovec *buf = NULL;
      iovec inline_buf[OSD_OP_INLINE_BUF_COUNT];

      // Releases the heap array, if the list ever outgrew the inline storage.
      ~osd_op_buf_list_t()
      {
          if (buf && buf != inline_buf)
          {
              free(buf);
          }
      }

      // Returns a pointer to the first entry that has not been skipped by `sent`.
      inline iovec* get_iovec()
      {
          return (buf ? buf : inline_buf) + sent;
      }

      // Returns the number of entries in [sent, count).
      inline int get_size()
      {
          return count - sent;
      }

      // Appends one buffer of `len` bytes starting at `nbuf`.
      // Throws std::bad_alloc if the entry array cannot be grown; in that case
      // the list is left unchanged.
      inline void push_back(void *nbuf, size_t len)
      {
          if (count >= alloc)
          {
              if (!alloc)
              {
                  // First push: start with the inline storage
                  alloc = OSD_OP_INLINE_BUF_COUNT;
                  buf = inline_buf;
              }
              else
              {
                  // Grow to the next multiple of 16 strictly above the current
                  // capacity. The previous formula, (alloc/16)*16 + 1, produced
                  // 17 from 16 and then 17 again, so the capacity stopped
                  // growing and later pushes wrote past the end of the array.
                  int new_alloc = (alloc/16 + 1) * 16;
                  if (buf == inline_buf)
                  {
                      // Moving from inline to heap storage: copy existing entries
                      iovec *heap_buf = (iovec*)malloc(sizeof(iovec) * new_alloc);
                      if (!heap_buf)
                      {
                          throw std::bad_alloc();
                      }
                      memcpy(heap_buf, inline_buf, sizeof(iovec) * alloc);
                      buf = heap_buf;
                  }
                  else
                  {
                      // Keep the old pointer until realloc() succeeds so that
                      // the array is neither leaked nor lost on failure
                      iovec *grown_buf = (iovec*)realloc(buf, sizeof(iovec) * new_alloc);
                      if (!grown_buf)
                      {
                          throw std::bad_alloc();
                      }
                      buf = grown_buf;
                  }
                  alloc = new_alloc;
              }
          }
          buf[count].iov_base = nbuf;
          buf[count].iov_len = len;
          count++;
      }
  };
  78. struct osd_primary_op_data_t;
  79. struct osd_op_t
  80. {
  81. timeval tv_begin;
  82. timeval tv_send;
  83. int op_type = OSD_OP_IN;
  84. int peer_fd;
  85. osd_any_op_t req;
  86. osd_any_reply_t reply;
  87. blockstore_op_t *bs_op = NULL;
  88. void *buf = NULL;
  89. void *rmw_buf = NULL;
  90. osd_primary_op_data_t* op_data = NULL;
  91. std::function<void(osd_op_t*)> callback;
  92. osd_op_buf_list_t send_list;
  93. ~osd_op_t();
  94. };
  95. struct osd_peer_def_t
  96. {
  97. osd_num_t osd_num = 0;
  98. std::string addr;
  99. int port = 0;
  100. time_t last_connect_attempt = 0;
  101. };
  102. struct osd_client_t
  103. {
  104. sockaddr_in peer_addr;
  105. int peer_port;
  106. int peer_fd;
  107. int peer_state;
  108. std::function<void(osd_num_t, int)> connect_callback;
  109. osd_num_t osd_num = 0;
  110. // Read state
  111. int read_ready = 0;
  112. osd_op_t *read_op = NULL;
  113. int read_reply_id = 0;
  114. iovec read_iov = { 0 };
  115. msghdr read_msg = { 0 };
  116. void *read_buf = NULL;
  117. int read_remaining = 0;
  118. int read_state = 0;
  119. // Outbound operations sent to this client (which is probably an OSD peer)
  120. std::map<int, osd_op_t*> sent_ops;
  121. // Outbound messages (replies or requests)
  122. std::deque<osd_op_t*> outbox;
  123. // PGs dirtied by this client's primary-writes
  124. std::set<pg_num_t> dirty_pgs;
  125. // Write state
  126. osd_op_t *write_op = NULL;
  127. msghdr write_msg;
  128. int write_state = 0;
  129. };
  130. struct osd_rmw_stripe_t;
  131. struct osd_object_id_t
  132. {
  133. osd_num_t osd_num;
  134. object_id oid;
  135. };
  136. class osd_t
  137. {
  138. // config
  139. osd_num_t osd_num = 1; // OSD numbers start with 1
  140. bool run_primary = false;
  141. std::vector<osd_peer_def_t> peers;
  142. blockstore_config_t config;
  143. std::string bind_address;
  144. int bind_port, listen_backlog;
  145. int client_queue_depth = 128;
  146. bool allow_test_ops = true;
  147. // peer OSDs
  148. std::map<uint64_t, int> osd_peer_fds;
  149. std::vector<pg_t> pgs;
  150. int peering_state = 0;
  151. unsigned pg_count = 0;
  152. uint64_t next_subop_id = 1;
  153. // Unstable writes
  154. std::map<osd_object_id_t, uint64_t> unstable_writes;
  155. std::deque<osd_op_t*> syncs_in_progress;
  156. // client & peer I/O
  157. bool stopping = false;
  158. int inflight_ops = 0;
  159. blockstore_t *bs;
  160. uint32_t bs_block_size, bs_disk_alignment;
  161. uint64_t parity_block_size = 4*1024*1024; // 4 MB by default
  162. ring_loop_t *ringloop;
  163. timerfd_interval *tick_tfd;
  164. int wait_state = 0;
  165. int epoll_fd = 0;
  166. int listen_fd = 0;
  167. ring_consumer_t consumer;
  168. std::unordered_map<int,osd_client_t> clients;
  169. std::vector<int> read_ready_clients;
  170. std::vector<int> write_ready_clients;
  171. uint64_t op_stat_sum[OSD_OP_MAX+1] = { 0 };
  172. uint64_t op_stat_count[OSD_OP_MAX+1] = { 0 };
  173. uint64_t subop_stat_sum[OSD_OP_MAX+1] = { 0 };
  174. uint64_t subop_stat_count[OSD_OP_MAX+1] = { 0 };
  175. uint64_t send_stat_sum = 0;
  176. uint64_t send_stat_count = 0;
  177. // methods
  178. // event loop, socket read/write
  179. void loop();
  180. void handle_epoll_events();
  181. bool try_receive(osd_client_t & cl);
  182. void read_requests();
  183. void handle_read(ring_data_t *data, int peer_fd);
  184. void handle_op_hdr(osd_client_t *cl);
  185. void handle_reply_hdr(osd_client_t *cl);
  186. void send_replies();
  187. void handle_send(ring_data_t *data, int peer_fd);
  188. void outbox_push(osd_client_t & cl, osd_op_t *op);
  189. // peer handling (primary OSD logic)
  190. void connect_peer(osd_num_t osd_num, const char *peer_host, int peer_port, std::function<void(osd_num_t, int)> callback);
  191. void handle_connect_result(int peer_fd);
  192. void cancel_osd_ops(osd_client_t & cl);
  193. void cancel_op(osd_op_t *op);
  194. void stop_client(int peer_fd);
  195. osd_peer_def_t parse_peer(std::string peer);
  196. void init_primary();
  197. void handle_peers();
  198. void repeer_pgs(osd_num_t osd_num, bool is_connected);
  199. void start_pg_peering(int i);
  200. // op execution
  201. void exec_op(osd_op_t *cur_op);
  202. // secondary ops
  203. void exec_sync_stab_all(osd_op_t *cur_op);
  204. void exec_show_config(osd_op_t *cur_op);
  205. void exec_secondary(osd_op_t *cur_op);
  206. void secondary_op_callback(osd_op_t *cur_op);
  207. // primary ops
  208. bool prepare_primary_rw(osd_op_t *cur_op);
  209. void continue_primary_read(osd_op_t *cur_op);
  210. void continue_primary_write(osd_op_t *cur_op);
  211. void continue_primary_sync(osd_op_t *cur_op);
  212. void finish_primary_op(osd_op_t *cur_op, int retval);
  213. void handle_primary_subop(osd_op_t *cur_op, int ok, uint64_t version);
  214. void submit_primary_subops(int submit_type, int read_pg_size, const uint64_t* osd_set, osd_op_t *cur_op);
  215. void submit_primary_sync_subops(osd_op_t *cur_op);
  216. void submit_primary_stab_subops(osd_op_t *cur_op);
  217. public:
  218. osd_t(blockstore_config_t & config, blockstore_t *bs, ring_loop_t *ringloop);
  219. ~osd_t();
  220. bool shutdown();
  221. };
  222. inline bool operator == (const osd_object_id_t & a, const osd_object_id_t & b)
  223. {
  224. return a.osd_num == b.osd_num && a.oid.inode == b.oid.inode && a.oid.stripe == b.oid.stripe;
  225. }
  226. inline bool operator < (const osd_object_id_t & a, const osd_object_id_t & b)
  227. {
  228. return a.osd_num < b.osd_num || a.osd_num == b.osd_num && (
  229. a.oid.inode < b.oid.inode || a.oid.inode == b.oid.inode && a.oid.stripe < b.oid.stripe
  230. );
  231. }