Simplified distributed block storage with strong consistency, like in Ceph
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 
 
 
 

160 lines
4.6 KiB

  1. // Copyright (c) Vitaliy Filippov, 2019+
  2. // License: VNPL-1.0 (see README.md for details)
  3. #include <map>
  4. #include <vector>
  5. #include <algorithm>
  6. #include "cpp-btree/btree_map.h"
  7. #include "object_id.h"
  8. #include "osd_ops.h"
  9. #include "pg_states.h"
  10. #define PG_EPOCH_BITS 48
  11. struct pg_obj_loc_t
  12. {
  13. uint64_t role;
  14. osd_num_t osd_num;
  15. bool outdated;
  16. };
  17. typedef std::vector<pg_obj_loc_t> pg_osd_set_t;
  18. struct pg_osd_set_state_t
  19. {
  20. // (role -> osd_num_t) map, as in pg.target_set and pg.cur_set
  21. std::vector<osd_num_t> read_target;
  22. // full OSD set including additional OSDs where the object is misplaced
  23. pg_osd_set_t osd_set;
  24. uint64_t state = 0;
  25. uint64_t object_count = 0;
  26. };
  27. struct pg_list_result_t
  28. {
  29. obj_ver_id *buf = NULL;
  30. uint64_t total_count;
  31. uint64_t stable_count;
  32. };
  33. struct osd_op_t;
  34. struct pg_peering_state_t
  35. {
  36. // osd_num -> list result
  37. std::map<osd_num_t, osd_op_t*> list_ops;
  38. std::map<osd_num_t, pg_list_result_t> list_results;
  39. pool_id_t pool_id = 0;
  40. pg_num_t pg_num = 0;
  41. };
  42. struct obj_piece_id_t
  43. {
  44. object_id oid;
  45. uint64_t osd_num;
  46. };
  47. struct flush_action_t
  48. {
  49. bool rollback = false, make_stable = false;
  50. uint64_t stable_to = 0, rollback_to = 0;
  51. bool submitted = false;
  52. };
  53. struct pg_flush_batch_t
  54. {
  55. std::map<osd_num_t, std::vector<obj_ver_id>> rollback_lists;
  56. std::map<osd_num_t, std::vector<obj_ver_id>> stable_lists;
  57. int flush_ops = 0, flush_done = 0;
  58. int flush_objects = 0;
  59. };
  60. struct pg_t
  61. {
  62. int state = 0;
  63. uint64_t scheme = 0;
  64. uint64_t pg_cursize = 0, pg_size = 0, pg_minsize = 0, parity_chunks = 0;
  65. pool_id_t pool_id = 0;
  66. pg_num_t pg_num = 0;
  67. uint64_t clean_count = 0, total_count = 0;
  68. // epoch number - should increase with each non-clean activation of the PG
  69. uint64_t epoch = 0, reported_epoch = 0;
  70. // target history and all potential peers
  71. std::vector<std::vector<osd_num_t>> target_history;
  72. std::vector<osd_num_t> all_peers;
  73. bool history_changed = false;
  74. // peer list from the last peering event
  75. std::vector<osd_num_t> cur_peers;
  76. // target_set is the "correct" peer OSD set for this PG
  77. std::vector<osd_num_t> target_set;
  78. // cur_set is the current set of connected peer OSDs for this PG
  79. // cur_set = (role => osd_num or UINT64_MAX if missing). role numbers begin with zero
  80. std::vector<osd_num_t> cur_set;
  81. // same thing in state_dict-like format
  82. pg_osd_set_t cur_loc_set;
  83. // moved object map. by default, each object is considered to reside on cur_set.
  84. // this map stores all objects that differ.
  85. // it may consume up to ~ (raw storage / object size) * 24 bytes in the worst case scenario
  86. // which is up to ~192 MB per 1 TB in the worst case scenario
  87. std::map<pg_osd_set_t, pg_osd_set_state_t> state_dict;
  88. btree::btree_map<object_id, pg_osd_set_state_t*> incomplete_objects, misplaced_objects, degraded_objects;
  89. std::map<obj_piece_id_t, flush_action_t> flush_actions;
  90. btree::btree_map<object_id, uint64_t> ver_override;
  91. pg_peering_state_t *peering_state = NULL;
  92. pg_flush_batch_t *flush_batch = NULL;
  93. int inflight = 0; // including write_queue
  94. std::multimap<object_id, osd_op_t*> write_queue;
  95. void calc_object_states(int log_level);
  96. void print_state();
  97. };
  98. inline bool operator < (const pg_obj_loc_t &a, const pg_obj_loc_t &b)
  99. {
  100. return a.outdated < b.outdated ||
  101. a.outdated == b.outdated && a.role < b.role ||
  102. a.outdated == b.outdated && a.role == b.role && a.osd_num < b.osd_num;
  103. }
  104. inline bool operator == (const obj_piece_id_t & a, const obj_piece_id_t & b)
  105. {
  106. return a.oid == b.oid && a.osd_num == b.osd_num;
  107. }
  108. inline bool operator < (const obj_piece_id_t & a, const obj_piece_id_t & b)
  109. {
  110. return a.oid < b.oid || a.oid == b.oid && a.osd_num < b.osd_num;
  111. }
  112. namespace std
  113. {
  114. template<> struct hash<pg_osd_set_t>
  115. {
  116. inline size_t operator()(const pg_osd_set_t &s) const
  117. {
  118. size_t seed = 0;
  119. for (auto e: s)
  120. {
  121. // Copy-pasted from spp::hash_combine()
  122. seed ^= (e.role + 0xc6a4a7935bd1e995 + (seed << 6) + (seed >> 2));
  123. seed ^= (e.osd_num + 0xc6a4a7935bd1e995 + (seed << 6) + (seed >> 2));
  124. }
  125. return seed;
  126. }
  127. };
  128. template<> struct hash<obj_piece_id_t>
  129. {
  130. inline size_t operator()(const obj_piece_id_t &s) const
  131. {
  132. size_t seed = std::hash<object_id>()(s.oid);
  133. // Copy-pasted from spp::hash_combine()
  134. seed ^= (s.osd_num + 0xc6a4a7935bd1e995 + (seed << 6) + (seed >> 2));
  135. return seed;
  136. }
  137. };
  138. }