Simplified distributed block storage with strong consistency, like in Ceph
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 
 
 

200 lines
5.6 KiB

  1. // Copyright (c) Vitaliy Filippov, 2019+
  2. // License: VNPL-1.0 (see README.md for details)
  3. #pragma once
  4. #ifndef _LARGEFILE64_SOURCE
  5. #define _LARGEFILE64_SOURCE
  6. #endif
  7. #include <stdint.h>
  8. #include <string>
  9. #include <map>
  10. #include <unordered_map>
  11. #include <functional>
  12. #include "object_id.h"
  13. #include "ringloop.h"
  // Memory alignment for direct I/O (usually 512 bytes).
  // All other alignments must be a multiple of this one.
  // The #ifndef guard lets an earlier definition of MEM_ALIGNMENT take precedence.
  #ifndef MEM_ALIGNMENT
  #define MEM_ALIGNMENT 512
  #endif

  // Default block size is 128 KB, current allowed range is 4K - 128M.
  // NOTE(review): DEFAULT_ORDER looks like log2 of the default block size
  // (2^17 bytes = 128 KB) - confirm against its users.
  #define DEFAULT_ORDER 17
  // The size limits are parenthesized so that each one expands as a single value
  // inside a larger expression (for example `x / MIN_BLOCK_SIZE` or `x % MAX_BLOCK_SIZE`).
  #define MIN_BLOCK_SIZE (4*1024)
  #define MAX_BLOCK_SIZE (128*1024*1024)
  #define DEFAULT_BITMAP_GRANULARITY 4096
  // Blockstore operation codes, stored in blockstore_op_t::opcode.
  // BS_OP_MIN and BS_OP_MAX equal the lowest and the highest opcode defined here.
  #define BS_OP_MIN               1
  #define BS_OP_READ              1   // read object data
  #define BS_OP_WRITE             2   // write object data
  #define BS_OP_WRITE_STABLE      3   // write a version that doesn't require marking as stable
  #define BS_OP_SYNC              4   // make previously issued modifications reach physical media
  #define BS_OP_STABLE            5   // mark object versions as stable
  #define BS_OP_DELETE            6   // delete an object
  #define BS_OP_LIST              7   // list objects in the blockstore
  #define BS_OP_ROLLBACK          8   // roll back previous unstable writes
  #define BS_OP_SYNC_STAB_ALL     9   // ONLY FOR TESTS: sync and mark everything as stable
  #define BS_OP_MAX               9

  // Size in bytes of the blockstore_op_t::private_data array
  #define BS_OP_PRIVATE_DATA_SIZE 256
  36. /*
  37. Blockstore opcode documentation:
  38. ## BS_OP_READ / BS_OP_WRITE / BS_OP_WRITE_STABLE
  39. Read or write object data. WRITE_STABLE writes a version that doesn't require marking as stable.
  40. Input:
  41. - oid = requested object
  42. - version = requested version.
  43. For reads:
  44. - version == 0: read the last stable version,
  45. - version == UINT64_MAX: read the last version,
  46. - otherwise: read the newest version that is <= the specified version
  47. - reads aren't guaranteed to return data from previous unfinished writes
  48. For writes:
  49. - if version == 0, a new version is assigned automatically
  50. - if version != 0, it is assigned for the new write if possible, otherwise -EINVAL is returned
  51. - offset, len = offset and length within object. length may be zero, in that case
  52. read operation only returns the version / write operation only bumps the version
  53. - buf = pre-allocated buffer for data (read) / with data (write). may be NULL if len == 0.
  54. - bitmap = pointer to <entry_attr_size> bytes long (usually very short) arbitrary data
  55. stored for each object in the metadata area.
  56. Called "bitmap" because it's used for the "external bitmap" in Vitastor.
  57. Output:
  58. - retval = number of bytes actually read/written or negative error number (-EINVAL or -ENOSPC)
  59. - version = the version actually read or written
  60. ## BS_OP_DELETE
  61. Delete an object.
  62. Input:
  63. - oid = requested object
  64. - version = requested version. Treated the same as with BS_OP_WRITE
  65. Output:
  66. - retval = 0 or negative error number (-EINVAL)
  67. - version = the version actually written (delete is initially written as an object version)
  68. ## BS_OP_SYNC
  69. Make sure all previously issued modifications reach physical media.
  70. Input: Nothing except opcode
  71. Output:
  72. - retval = 0 or negative error number (-EINVAL)
  73. ## BS_OP_STABLE / BS_OP_ROLLBACK
  74. Mark objects as stable / rollback previous unstable writes.
  75. Input:
  76. - len = count of obj_ver_id's to stabilize or rollback
  77. - stabilize: all object versions up to the requested version of each object are marked as stable
  78. - rollback: all objects are rolled back to the requested stable versions
  79. - buf = pre-allocated obj_ver_id array <len> units long
  80. Output:
  81. - retval = 0 or negative error number (-EINVAL, -ENOENT if no such version or -EBUSY if not synced)
  82. ## BS_OP_SYNC_STAB_ALL
  83. ONLY FOR TESTS! Sync and mark all unstable object versions as stable, at once.
  84. Input: Nothing except opcode
  85. Output:
  86. - retval = 0 or negative error number (-EINVAL)
  87. ## BS_OP_LIST
  88. Get a list of all objects in this Blockstore.
  89. Input:
  90. - oid.stripe = PG alignment
  91. - len = PG count or 0 to list all objects
  92. - offset = PG number
  93. - oid.inode = min inode number or 0 to list all inodes
  94. - version = max inode number or 0 to list all inodes
  95. Output:
  96. - retval = total obj_ver_id count
  97. - version = stable obj_ver_id count
  98. - buf = obj_ver_id array allocated by the blockstore. Stable versions come first.
  99. You must free it yourself after usage with free().
  100. Output includes all objects for which (((inode + stripe / <PG alignment>) % <PG count>) == <PG number>).
  101. */
  102. struct blockstore_op_t
  103. {
  104. // operation
  105. uint64_t opcode;
  106. // finish callback
  107. std::function<void (blockstore_op_t*)> callback;
  108. object_id oid;
  109. uint64_t version;
  110. uint32_t offset;
  111. uint32_t len;
  112. void *buf;
  113. void *bitmap;
  114. int retval;
  115. uint8_t private_data[BS_OP_PRIVATE_DATA_SIZE];
  116. };
  // Blockstore configuration: string keys mapped to string values
  using blockstore_config_t = std::unordered_map<std::string, std::string>;

  // Implementation class; only forward-declared in this header
  class blockstore_impl_t;
  119. class blockstore_t
  120. {
  121. blockstore_impl_t *impl;
  122. public:
  123. blockstore_t(blockstore_config_t & config, ring_loop_t *ringloop);
  124. ~blockstore_t();
  125. // Event loop
  126. void loop();
  127. // Returns true when blockstore is ready to process operations
  128. // (Although you're free to enqueue them before that)
  129. bool is_started();
  130. // Returns true when blockstore is stalled
  131. bool is_stalled();
  132. // Returns true when it's safe to destroy the instance. If destroying the instance
  133. // requires to purge some queues, starts that process. Should be called in the event
  134. // loop until it returns true.
  135. bool is_safe_to_stop();
  136. // Submission
  137. void enqueue_op(blockstore_op_t *op);
  138. // Insert operation into the beginning of the queue
  139. // Intended for the OSD syncer "thread" to be able to stabilize something when the journal is full
  140. void enqueue_op_first(blockstore_op_t *op);
  141. // Unstable writes are added here (map of object_id -> version)
  142. std::unordered_map<object_id, uint64_t> & get_unstable_writes();
  143. // Get per-inode space usage statistics
  144. std::map<uint64_t, uint64_t> & get_inode_space_stats();
  145. // FIXME rename to object_size
  146. uint32_t get_block_size();
  147. uint64_t get_block_count();
  148. uint64_t get_free_block_count();
  149. uint32_t get_bitmap_granularity();
  150. };