Simplified distributed block storage with strong consistency, like in Ceph
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 
 
 
 

214 lines
7.8 KiB

  1. // Copyright (c) Vitaliy Filippov, 2019+
  2. // License: VNPL-1.1 (see README.md for details)
  3. #include "osd.h"
  4. #include "json11/json11.hpp"
  5. void osd_t::secondary_op_callback(osd_op_t *op)
  6. {
  7. if (op->req.hdr.opcode == OSD_OP_SEC_READ ||
  8. op->req.hdr.opcode == OSD_OP_SEC_WRITE ||
  9. op->req.hdr.opcode == OSD_OP_SEC_WRITE_STABLE)
  10. {
  11. op->reply.sec_rw.version = op->bs_op->version;
  12. }
  13. else if (op->req.hdr.opcode == OSD_OP_SEC_DELETE)
  14. {
  15. op->reply.sec_del.version = op->bs_op->version;
  16. }
  17. if (op->req.hdr.opcode == OSD_OP_SEC_READ)
  18. {
  19. if (op->bs_op->retval >= 0)
  20. op->reply.sec_rw.attr_len = clean_entry_bitmap_size;
  21. else
  22. op->reply.sec_rw.attr_len = 0;
  23. if (op->bs_op->retval > 0)
  24. op->iov.push_back(op->buf, op->bs_op->retval);
  25. }
  26. else if (op->req.hdr.opcode == OSD_OP_SEC_LIST)
  27. {
  28. // allocated by blockstore
  29. op->buf = op->bs_op->buf;
  30. if (op->bs_op->retval > 0)
  31. {
  32. op->iov.push_back(op->buf, op->bs_op->retval * sizeof(obj_ver_id));
  33. }
  34. op->reply.sec_list.stable_count = op->bs_op->version;
  35. }
  36. int retval = op->bs_op->retval;
  37. delete op->bs_op;
  38. op->bs_op = NULL;
  39. finish_op(op, retval);
  40. }
  41. void osd_t::exec_secondary(osd_op_t *cur_op)
  42. {
  43. if (cur_op->req.hdr.opcode == OSD_OP_SEC_READ_BMP)
  44. {
  45. int n = cur_op->req.sec_read_bmp.len / sizeof(obj_ver_id);
  46. if (n > 0)
  47. {
  48. obj_ver_id *ov = (obj_ver_id*)cur_op->buf;
  49. void *reply_buf = malloc_or_die(n * (8 + clean_entry_bitmap_size));
  50. void *cur_buf = reply_buf;
  51. for (int i = 0; i < n; i++)
  52. {
  53. bs->read_bitmap(ov[i].oid, ov[i].version, cur_buf + sizeof(uint64_t), (uint64_t*)cur_buf);
  54. cur_buf += (8 + clean_entry_bitmap_size);
  55. }
  56. free(cur_op->buf);
  57. cur_op->buf = reply_buf;
  58. }
  59. finish_op(cur_op, n * (8 + clean_entry_bitmap_size));
  60. return;
  61. }
  62. cur_op->bs_op = new blockstore_op_t();
  63. cur_op->bs_op->callback = [this, cur_op](blockstore_op_t* bs_op) { secondary_op_callback(cur_op); };
  64. cur_op->bs_op->opcode = (cur_op->req.hdr.opcode == OSD_OP_SEC_READ ? BS_OP_READ
  65. : (cur_op->req.hdr.opcode == OSD_OP_SEC_WRITE ? BS_OP_WRITE
  66. : (cur_op->req.hdr.opcode == OSD_OP_SEC_WRITE_STABLE ? BS_OP_WRITE_STABLE
  67. : (cur_op->req.hdr.opcode == OSD_OP_SEC_SYNC ? BS_OP_SYNC
  68. : (cur_op->req.hdr.opcode == OSD_OP_SEC_STABILIZE ? BS_OP_STABLE
  69. : (cur_op->req.hdr.opcode == OSD_OP_SEC_ROLLBACK ? BS_OP_ROLLBACK
  70. : (cur_op->req.hdr.opcode == OSD_OP_SEC_DELETE ? BS_OP_DELETE
  71. : (cur_op->req.hdr.opcode == OSD_OP_SEC_LIST ? BS_OP_LIST
  72. : -1))))))));
  73. if (cur_op->req.hdr.opcode == OSD_OP_SEC_READ ||
  74. cur_op->req.hdr.opcode == OSD_OP_SEC_WRITE ||
  75. cur_op->req.hdr.opcode == OSD_OP_SEC_WRITE_STABLE)
  76. {
  77. if (cur_op->req.hdr.opcode == OSD_OP_SEC_READ)
  78. {
  79. // Allocate memory for the read operation
  80. if (clean_entry_bitmap_size > sizeof(unsigned))
  81. cur_op->bitmap = cur_op->rmw_buf = malloc_or_die(clean_entry_bitmap_size);
  82. else
  83. cur_op->bitmap = &cur_op->bmp_data;
  84. if (cur_op->req.sec_rw.len > 0)
  85. cur_op->buf = memalign_or_die(MEM_ALIGNMENT, cur_op->req.sec_rw.len);
  86. }
  87. cur_op->bs_op->oid = cur_op->req.sec_rw.oid;
  88. cur_op->bs_op->version = cur_op->req.sec_rw.version;
  89. cur_op->bs_op->offset = cur_op->req.sec_rw.offset;
  90. cur_op->bs_op->len = cur_op->req.sec_rw.len;
  91. cur_op->bs_op->buf = cur_op->buf;
  92. cur_op->bs_op->bitmap = cur_op->bitmap;
  93. #ifdef OSD_STUB
  94. cur_op->bs_op->retval = cur_op->bs_op->len;
  95. #endif
  96. }
  97. else if (cur_op->req.hdr.opcode == OSD_OP_SEC_DELETE)
  98. {
  99. cur_op->bs_op->oid = cur_op->req.sec_del.oid;
  100. cur_op->bs_op->version = cur_op->req.sec_del.version;
  101. #ifdef OSD_STUB
  102. cur_op->bs_op->retval = 0;
  103. #endif
  104. }
  105. else if (cur_op->req.hdr.opcode == OSD_OP_SEC_STABILIZE ||
  106. cur_op->req.hdr.opcode == OSD_OP_SEC_ROLLBACK)
  107. {
  108. cur_op->bs_op->len = cur_op->req.sec_stab.len/sizeof(obj_ver_id);
  109. cur_op->bs_op->buf = cur_op->buf;
  110. #ifdef OSD_STUB
  111. cur_op->bs_op->retval = 0;
  112. #endif
  113. }
  114. else if (cur_op->req.hdr.opcode == OSD_OP_SEC_LIST)
  115. {
  116. if (cur_op->req.sec_list.pg_count < cur_op->req.sec_list.list_pg)
  117. {
  118. // requested pg number is greater than total pg count
  119. printf("Invalid LIST request: pg count %u < pg number %u\n", cur_op->req.sec_list.pg_count, cur_op->req.sec_list.list_pg);
  120. cur_op->bs_op->retval = -EINVAL;
  121. secondary_op_callback(cur_op);
  122. return;
  123. }
  124. cur_op->bs_op->oid.stripe = cur_op->req.sec_list.pg_stripe_size;
  125. cur_op->bs_op->len = cur_op->req.sec_list.pg_count;
  126. cur_op->bs_op->offset = cur_op->req.sec_list.list_pg - 1;
  127. cur_op->bs_op->oid.inode = cur_op->req.sec_list.min_inode;
  128. cur_op->bs_op->version = cur_op->req.sec_list.max_inode;
  129. #ifdef OSD_STUB
  130. cur_op->bs_op->retval = 0;
  131. cur_op->bs_op->buf = NULL;
  132. #endif
  133. }
  134. #ifdef OSD_STUB
  135. secondary_op_callback(cur_op);
  136. #else
  137. bs->enqueue_op(cur_op->bs_op);
  138. #endif
  139. }
  140. void osd_t::exec_show_config(osd_op_t *cur_op)
  141. {
  142. std::string json_err;
  143. json11::Json req_json = cur_op->req.show_conf.json_len > 0
  144. ? json11::Json::parse(std::string((char *)cur_op->buf), json_err)
  145. : json11::Json();
  146. // Expose sensitive configuration values so peers can check them
  147. json11::Json::object wire_config = json11::Json::object {
  148. { "osd_num", osd_num },
  149. { "protocol_version", OSD_PROTOCOL_VERSION },
  150. { "block_size", (uint64_t)bs_block_size },
  151. { "bitmap_granularity", (uint64_t)bs_bitmap_granularity },
  152. { "primary_enabled", run_primary },
  153. { "blockstore_enabled", bs ? true : false },
  154. { "readonly", readonly },
  155. { "immediate_commit", (immediate_commit == IMMEDIATE_ALL ? "all" :
  156. (immediate_commit == IMMEDIATE_SMALL ? "small" : "none")) },
  157. { "lease_timeout", etcd_report_interval+(MAX_ETCD_ATTEMPTS*(2*ETCD_QUICK_TIMEOUT)+999)/1000 },
  158. };
  159. #ifdef WITH_RDMA
  160. if (msgr.is_rdma_enabled())
  161. {
  162. // Indicate that RDMA is enabled
  163. wire_config["rdma_enabled"] = true;
  164. if (req_json["connect_rdma"].is_string())
  165. {
  166. // Peer is trying to connect using RDMA, try to satisfy him
  167. bool ok = msgr.connect_rdma(cur_op->peer_fd, req_json["connect_rdma"].string_value(), req_json["rdma_max_msg"].uint64_value());
  168. if (ok)
  169. {
  170. auto rc = msgr.clients.at(cur_op->peer_fd)->rdma_conn;
  171. wire_config["rdma_address"] = rc->addr.to_string();
  172. wire_config["rdma_max_msg"] = rc->max_msg;
  173. }
  174. }
  175. }
  176. #endif
  177. if (cur_op->buf)
  178. free(cur_op->buf);
  179. std::string cfg_str = json11::Json(wire_config).dump();
  180. cur_op->buf = malloc_or_die(cfg_str.size()+1);
  181. memcpy(cur_op->buf, cfg_str.c_str(), cfg_str.size()+1);
  182. cur_op->iov.push_back(cur_op->buf, cfg_str.size()+1);
  183. finish_op(cur_op, cfg_str.size()+1);
  184. }
  185. void osd_t::exec_sync_stab_all(osd_op_t *cur_op)
  186. {
  187. // Sync and stabilize all objects
  188. // This command is only valid for tests
  189. cur_op->bs_op = new blockstore_op_t();
  190. if (!allow_test_ops)
  191. {
  192. cur_op->bs_op->retval = -EINVAL;
  193. secondary_op_callback(cur_op);
  194. return;
  195. }
  196. cur_op->bs_op->opcode = BS_OP_SYNC_STAB_ALL;
  197. cur_op->bs_op->callback = [this, cur_op](blockstore_op_t *bs_op)
  198. {
  199. secondary_op_callback(cur_op);
  200. };
  201. #ifdef OSD_STUB
  202. cur_op->bs_op->retval = 0;
  203. secondary_op_callback(cur_op);
  204. #else
  205. bs->enqueue_op(cur_op->bs_op);
  206. #endif
  207. }