Simplified distributed block storage with strong consistency, like in Ceph
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 
 
 
 

195 lines
5.0 KiB

  1. // Copyright (c) Vitaliy Filippov, 2019+
  2. // License: VNPL-1.0 (see README.md for details)
  3. #pragma once
  4. #include "crc32c.h"
  // Journal size constants.
  // The arithmetic expansions are fully parenthesized so that they stay a single
  // operand inside larger expressions (e.g. `x / MIN_JOURNAL_SIZE` or
  // `x % JOURNAL_BUFFER_SIZE`); without the parentheses operator precedence
  // would split `4*1024*1024` apart.
  #define MIN_JOURNAL_SIZE (4*1024*1024)
  // NOTE(review): presumably the expected value of the 16-bit `magic` field of journal entries - confirm
  #define JOURNAL_MAGIC 0x4A33
  #define JOURNAL_BUFFER_SIZE (4*1024*1024)
  // We reserve some extra space for future stabilize requests during writes
  // FIXME: This value should be dynamic i.e. Blockstore ideally shouldn't allow
  // writing more than can be stabilized afterwards
  #define JOURNAL_STABILIZE_RESERVATION 65536
  // Journal entry type codes
  // Entries are chained together through their crc32 values, and because
  // object versions constantly increase the journal is almost a blockchain.
  // JE_MIN and JE_MAX equal the lowest and the highest type code defined here.
  #define JE_MIN                  0x01
  #define JE_START                0x01
  #define JE_SMALL_WRITE          0x02
  #define JE_BIG_WRITE            0x03
  #define JE_STABLE               0x04
  #define JE_DELETE               0x05
  #define JE_ROLLBACK             0x06
  #define JE_SMALL_WRITE_INSTANT  0x07
  #define JE_BIG_WRITE_INSTANT    0x08
  #define JE_MAX                  0x08
  // Journal entry of the "start" kind.
  // The crc32 field is placed first to ease checksum calculation: je_crc32()
  // skips exactly these leading 4 bytes when it hashes an entry.
  struct __attribute__((__packed__)) journal_entry_start
  {
      uint32_t crc32;
      uint16_t magic;
      uint16_t type;
      // Entry size; je_crc32() hashes <size>-4 bytes following the crc32 field
      uint32_t size;
      // Occupies the slot that holds crc32_prev in the other entry kinds
      uint32_t reserved;
      // NOTE(review): presumably the journal offset of the first entry still in use - confirm
      uint64_t journal_start;
  };
  35. struct __attribute__((__packed__)) journal_entry_small_write
  36. {
  37. uint32_t crc32;
  38. uint16_t magic;
  39. uint16_t type;
  40. uint32_t size;
  41. uint32_t crc32_prev;
  42. object_id oid;
  43. uint64_t version;
  44. uint32_t offset;
  45. uint32_t len;
  46. // small_write entries contain <len> bytes of data which is stored in next sectors
  47. // data_offset is its offset within journal
  48. uint64_t data_offset;
  49. uint32_t crc32_data;
  50. // small_write and big_write entries are followed by the "external" bitmap
  51. // its size is dynamic and included in journal entry's <size> field
  52. uint8_t bitmap[];
  53. };
  54. struct __attribute__((__packed__)) journal_entry_big_write
  55. {
  56. uint32_t crc32;
  57. uint16_t magic;
  58. uint16_t type;
  59. uint32_t size;
  60. uint32_t crc32_prev;
  61. object_id oid;
  62. uint64_t version;
  63. uint32_t offset;
  64. uint32_t len;
  65. uint64_t location;
  66. // small_write and big_write entries are followed by the "external" bitmap
  67. // its size is dynamic and included in journal entry's <size> field
  68. uint8_t bitmap[];
  69. };
  70. struct __attribute__((__packed__)) journal_entry_stable
  71. {
  72. uint32_t crc32;
  73. uint16_t magic;
  74. uint16_t type;
  75. uint32_t size;
  76. uint32_t crc32_prev;
  77. object_id oid;
  78. uint64_t version;
  79. };
  80. struct __attribute__((__packed__)) journal_entry_rollback
  81. {
  82. uint32_t crc32;
  83. uint16_t magic;
  84. uint16_t type;
  85. uint32_t size;
  86. uint32_t crc32_prev;
  87. object_id oid;
  88. uint64_t version;
  89. };
  90. struct __attribute__((__packed__)) journal_entry_del
  91. {
  92. uint32_t crc32;
  93. uint16_t magic;
  94. uint16_t type;
  95. uint32_t size;
  96. uint32_t crc32_prev;
  97. object_id oid;
  98. uint64_t version;
  99. };
  100. struct __attribute__((__packed__)) journal_entry
  101. {
  102. union
  103. {
  104. struct __attribute__((__packed__))
  105. {
  106. uint32_t crc32;
  107. uint16_t magic;
  108. uint16_t type;
  109. uint32_t size;
  110. uint32_t crc32_prev;
  111. };
  112. journal_entry_start start;
  113. journal_entry_small_write small_write;
  114. journal_entry_big_write big_write;
  115. journal_entry_stable stable;
  116. journal_entry_rollback rollback;
  117. journal_entry_del del;
  118. };
  119. };
  120. inline uint32_t je_crc32(journal_entry *je)
  121. {
  122. // 0x48674bc7 = crc32(4 zero bytes)
  123. return crc32c(0x48674bc7, ((uint8_t*)je)+4, je->size-4);
  124. }
  // Bookkeeping record for one journal sector buffer
  // (journal_t keeps an array of these in its sector_info member).
  struct journal_sector_info_t
  {
      // NOTE(review): presumably the sector's offset within the journal - confirm
      uint64_t offset;
      // NOTE(review): presumably the number of users still referencing the sector - confirm
      uint64_t usage_count;
      // State flags (NOTE(review): exact semantics are set by the code using them - confirm)
      bool written;
      bool dirty;
  };
  132. struct journal_t
  133. {
  134. int fd;
  135. uint64_t device_size;
  136. bool inmemory = false;
  137. void *buffer = NULL;
  138. uint64_t block_size;
  139. uint64_t offset, len;
  140. // Next free block offset
  141. uint64_t next_free = 0;
  142. // First occupied block offset
  143. uint64_t used_start = 0;
  144. // End of the last block not used for writing anymore
  145. uint64_t dirty_start = 0;
  146. uint32_t crc32_last = 0;
  147. // Current sector(s) used for writing
  148. void *sector_buf = NULL;
  149. journal_sector_info_t *sector_info = NULL;
  150. uint64_t sector_count;
  151. bool no_same_sector_overwrites = false;
  152. int cur_sector = 0;
  153. int in_sector_pos = 0;
  154. // Used sector map
  155. // May use ~ 80 MB per 1 GB of used journal space in the worst case
  156. std::map<uint64_t, uint64_t> used_sectors;
  157. ~journal_t();
  158. bool trim();
  159. uint64_t get_trim_pos();
  160. };
  161. struct blockstore_journal_check_t
  162. {
  163. blockstore_impl_t *bs;
  164. uint64_t next_pos, next_sector, next_in_pos;
  165. int sectors_required, first_sector;
  166. bool right_dir; // writing to the end or the beginning of the ring buffer
  167. blockstore_journal_check_t(blockstore_impl_t *bs);
  168. int check_available(blockstore_op_t *op, int required, int size, int data_after);
  169. };
  170. journal_entry* prefill_single_journal_entry(journal_t & journal, uint16_t type, uint32_t size);
  171. void prepare_journal_sector_write(journal_t & journal, int sector, io_uring_sqe *sqe, std::function<void(ring_data_t*)> cb);