Simplified distributed block storage with strong consistency, like in Ceph
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 
 
 
 

204 lines
5.3 KiB

  1. // Copyright (c) Vitaliy Filippov, 2019+
  2. // License: VNPL-1.1 (see README.md for details)
  3. #pragma once
  4. #include "crc32c.h"
  // Journal size constants.
  // The product expressions are parenthesized so that the macros expand correctly
  // inside larger expressions, e.g. `x / MIN_JOURNAL_SIZE` or `x % JOURNAL_BUFFER_SIZE`
  // (without parentheses these would evaluate as `x / 4 * 1024 * 1024`).
  #define MIN_JOURNAL_SIZE (4*1024*1024)
  // NOTE(review): presumably the value expected in the <magic> field of journal entries - confirm
  #define JOURNAL_MAGIC 0x4A33
  #define JOURNAL_VERSION 1
  #define JOURNAL_BUFFER_SIZE (4*1024*1024)
  // We reserve some extra space for future stabilize requests during writes
  // FIXME: This value should be dynamic i.e. Blockstore ideally shouldn't allow
  // writing more than can be stabilized afterwards
  #define JOURNAL_STABILIZE_RESERVATION 65536
  // Journal entry type codes (stored in the 16-bit <type> field of an entry).
  // Entries are linked to each other by their crc32 value (each one records
  // the crc32 of its predecessor), so the journal is almost a blockchain,
  // because object versions constantly increase.
  #define JE_START 0x01
  #define JE_SMALL_WRITE 0x02
  #define JE_BIG_WRITE 0x03
  #define JE_STABLE 0x04
  #define JE_DELETE 0x05
  #define JE_ROLLBACK 0x06
  #define JE_SMALL_WRITE_INSTANT 0x07
  #define JE_BIG_WRITE_INSTANT 0x08
  // Inclusive bounds of the type code range above
  #define JE_MIN JE_START
  #define JE_MAX JE_BIG_WRITE_INSTANT
  // Packed layout of the journal "start" entry (32 bytes).
  // crc32c comes first to ease calculation and is equal to crc32()
  // Header fields: crc32, magic, type, size; the fourth 32-bit slot is <reserved> here
  // (the other entry kinds keep crc32_prev at that position).
  struct __attribute__((__packed__)) journal_entry_start
  {
      uint32_t crc32;
      uint16_t magic, type;
      uint32_t size, reserved;
      uint64_t journal_start, version;
  };
  // 24 bytes = all fields up to and including journal_start, i.e. the entry without <version>.
  // NOTE(review): presumably the size of start entries written before <version> was added - confirm
  #define JE_START_LEGACY_SIZE 24
  38. struct __attribute__((__packed__)) journal_entry_small_write
  39. {
  40. uint32_t crc32;
  41. uint16_t magic;
  42. uint16_t type;
  43. uint32_t size;
  44. uint32_t crc32_prev;
  45. object_id oid;
  46. uint64_t version;
  47. uint32_t offset;
  48. uint32_t len;
  49. // small_write entries contain <len> bytes of data which is stored in next sectors
  50. // data_offset is its offset within journal
  51. uint64_t data_offset;
  52. uint32_t crc32_data;
  53. // small_write and big_write entries are followed by the "external" bitmap
  54. // its size is dynamic and included in journal entry's <size> field
  55. uint8_t bitmap[];
  56. };
  57. struct __attribute__((__packed__)) journal_entry_big_write
  58. {
  59. uint32_t crc32;
  60. uint16_t magic;
  61. uint16_t type;
  62. uint32_t size;
  63. uint32_t crc32_prev;
  64. object_id oid;
  65. uint64_t version;
  66. uint32_t offset;
  67. uint32_t len;
  68. uint64_t location;
  69. // small_write and big_write entries are followed by the "external" bitmap
  70. // its size is dynamic and included in journal entry's <size> field
  71. uint8_t bitmap[];
  72. };
  73. struct __attribute__((__packed__)) journal_entry_stable
  74. {
  75. uint32_t crc32;
  76. uint16_t magic;
  77. uint16_t type;
  78. uint32_t size;
  79. uint32_t crc32_prev;
  80. object_id oid;
  81. uint64_t version;
  82. };
  83. struct __attribute__((__packed__)) journal_entry_rollback
  84. {
  85. uint32_t crc32;
  86. uint16_t magic;
  87. uint16_t type;
  88. uint32_t size;
  89. uint32_t crc32_prev;
  90. object_id oid;
  91. uint64_t version;
  92. };
  93. struct __attribute__((__packed__)) journal_entry_del
  94. {
  95. uint32_t crc32;
  96. uint16_t magic;
  97. uint16_t type;
  98. uint32_t size;
  99. uint32_t crc32_prev;
  100. object_id oid;
  101. uint64_t version;
  102. };
  103. struct __attribute__((__packed__)) journal_entry
  104. {
  105. union
  106. {
  107. struct __attribute__((__packed__))
  108. {
  109. uint32_t crc32;
  110. uint16_t magic;
  111. uint16_t type;
  112. uint32_t size;
  113. uint32_t crc32_prev;
  114. };
  115. journal_entry_start start;
  116. journal_entry_small_write small_write;
  117. journal_entry_big_write big_write;
  118. journal_entry_stable stable;
  119. journal_entry_rollback rollback;
  120. journal_entry_del del;
  121. };
  122. };
  123. inline uint32_t je_crc32(journal_entry *je)
  124. {
  125. // 0x48674bc7 = crc32(4 zero bytes)
  126. return crc32c(0x48674bc7, ((uint8_t*)je)+4, je->size-4);
  127. }
  // Bookkeeping record for one journal sector buffer (see journal_t::sector_info).
  // NOTE(review): exact meaning of flush_count is not visible in this header - confirm against its users
  struct journal_sector_info_t
  {
      uint64_t offset;
      uint64_t flush_count;
      // State flags; `written` is consulted by journal_t::entry_fits()
      bool written, dirty;
  };
  135. struct journal_t
  136. {
  137. int fd;
  138. uint64_t device_size;
  139. bool inmemory = false;
  140. bool flush_journal = false;
  141. void *buffer = NULL;
  142. uint64_t block_size;
  143. uint64_t offset, len;
  144. // Next free block offset
  145. uint64_t next_free = 0;
  146. // First occupied block offset
  147. uint64_t used_start = 0;
  148. // End of the last block not used for writing anymore
  149. uint64_t dirty_start = 0;
  150. uint32_t crc32_last = 0;
  151. // Current sector(s) used for writing
  152. void *sector_buf = NULL;
  153. journal_sector_info_t *sector_info = NULL;
  154. uint64_t sector_count;
  155. bool no_same_sector_overwrites = false;
  156. int cur_sector = 0;
  157. int in_sector_pos = 0;
  158. // Used sector map
  159. // May use ~ 80 MB per 1 GB of used journal space in the worst case
  160. std::map<uint64_t, uint64_t> used_sectors;
  161. ~journal_t();
  162. bool trim();
  163. uint64_t get_trim_pos();
  164. inline bool entry_fits(int size)
  165. {
  166. return !(block_size - in_sector_pos < size ||
  167. no_same_sector_overwrites && sector_info[cur_sector].written);
  168. }
  169. };
  170. struct blockstore_journal_check_t
  171. {
  172. blockstore_impl_t *bs;
  173. uint64_t next_pos, next_sector, next_in_pos;
  174. int sectors_to_write, first_sector;
  175. bool right_dir; // writing to the end or the beginning of the ring buffer
  176. blockstore_journal_check_t(blockstore_impl_t *bs);
  177. int check_available(blockstore_op_t *op, int required, int size, int data_after);
  178. };
  179. journal_entry* prefill_single_journal_entry(journal_t & journal, uint16_t type, uint32_t size);
  180. void prepare_journal_sector_write(journal_t & journal, int sector, io_uring_sqe *sqe, std::function<void(ring_data_t*)> cb);