Simplified distributed block storage with strong consistency, like in Ceph
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 
 
 
 

247 lines
6.1 KiB

  1. // Copyright (c) Vitaliy Filippov, 2019+
  2. // License: VNPL-1.1 or GNU GPL-2.0+ (see README.md for details)
  3. #pragma once
  4. #include "object_id.h"
  5. #include "osd_id.h"
  // Magic values for request (OP) and reply packets.
  // NOTE(review): the lowercase `l` literal suffix is easy to misread as the digit 1.
  #define SECONDARY_OSD_OP_MAGIC      0x2bd7b10325434553l
  #define SECONDARY_OSD_REPLY_MAGIC   0xbaa699b87b434553l

  // Request and reply headers occupy a fixed-size packet (0x80 = 128);
  // any data follows after it
  #define OSD_PACKET_SIZE             0x80
  // Operation codes. OSD_OP_MIN and OSD_OP_MAX bound the range of defined values.
  #define OSD_OP_MIN                  1

  // OSD_OP_SEC_* and test opcodes
  #define OSD_OP_SEC_READ             1
  #define OSD_OP_SEC_WRITE            2
  #define OSD_OP_SEC_WRITE_STABLE     3
  #define OSD_OP_SEC_SYNC             4
  #define OSD_OP_SEC_STABILIZE        5
  #define OSD_OP_SEC_ROLLBACK         6
  #define OSD_OP_SEC_DELETE           7
  #define OSD_OP_TEST_SYNC_STAB_ALL   8
  #define OSD_OP_SEC_LIST             9

  // Configuration, non-SEC read/write/sync/delete and ping opcodes
  #define OSD_OP_SHOW_CONFIG          10
  #define OSD_OP_READ                 11
  #define OSD_OP_WRITE                12
  #define OSD_OP_SYNC                 13
  #define OSD_OP_DELETE               14
  #define OSD_OP_PING                 15

  // Bitmap read opcode
  #define OSD_OP_SEC_READ_BMP         16

  #define OSD_OP_MAX                  16
  // Alignment & limit for read/write operations
  // MEM_ALIGNMENT may be predefined by the build; 512 is the fallback
  #ifndef MEM_ALIGNMENT
  #define MEM_ALIGNMENT 512
  #endif
  // 64*1024*1024 = 67108864 (presumably bytes, i.e. 64 MiB - TODO confirm unit).
  // The replacement list is parenthesized so the macro behaves as a single
  // value inside larger expressions such as `x / OSD_RW_MAX` or `x % OSD_RW_MAX`.
  #define OSD_RW_MAX (64*1024*1024)
  #define OSD_PROTOCOL_VERSION 1
  // Common request header: three packed 64-bit fields, 24 bytes in total.
  struct __attribute__((__packed__)) osd_op_header_t
  {
      uint64_t magic;     // magic & protocol version
      uint64_t id;        // operation id
      uint64_t opcode;    // operation type
  };
  // Common reply header: the same three fields as the request header
  // followed by a signed return value; packed, 32 bytes in total.
  struct __attribute__((__packed__)) osd_reply_header_t
  {
      uint64_t magic;     // magic & protocol version
      uint64_t id;        // operation id
      uint64_t opcode;    // operation type
      int64_t retval;     // return value
  };
  57. // read or write to the secondary OSD
  58. struct __attribute__((__packed__)) osd_op_sec_rw_t
  59. {
  60. osd_op_header_t header;
  61. // object
  62. object_id oid;
  63. // read/write version (automatic or specific)
  64. // FIXME deny values close to UINT64_MAX
  65. uint64_t version;
  66. // offset
  67. uint32_t offset;
  68. // length
  69. uint32_t len;
  70. // bitmap/attribute length - bitmap comes after header, but before data
  71. uint32_t attr_len;
  72. uint32_t pad0;
  73. };
  74. struct __attribute__((__packed__)) osd_reply_sec_rw_t
  75. {
  76. osd_reply_header_t header;
  77. // for reads and writes: assigned or read version number
  78. uint64_t version;
  79. // for reads: bitmap/attribute length (just to double-check)
  80. uint32_t attr_len;
  81. uint32_t pad0;
  82. };
  83. // delete object on the secondary OSD
  84. struct __attribute__((__packed__)) osd_op_sec_del_t
  85. {
  86. osd_op_header_t header;
  87. // object
  88. object_id oid;
  89. // delete version (automatic or specific)
  90. uint64_t version;
  91. };
  92. struct __attribute__((__packed__)) osd_reply_sec_del_t
  93. {
  94. osd_reply_header_t header;
  95. uint64_t version;
  96. };
  97. // sync to the secondary OSD
  98. struct __attribute__((__packed__)) osd_op_sec_sync_t
  99. {
  100. osd_op_header_t header;
  101. };
  102. struct __attribute__((__packed__)) osd_reply_sec_sync_t
  103. {
  104. osd_reply_header_t header;
  105. };
  106. // stabilize or rollback objects on the secondary OSD
  107. struct __attribute__((__packed__)) osd_op_sec_stab_t
  108. {
  109. osd_op_header_t header;
  110. // obj_ver_id array length in bytes
  111. uint64_t len;
  112. };
  113. typedef osd_op_sec_stab_t osd_op_sec_rollback_t;
  114. struct __attribute__((__packed__)) osd_reply_sec_stab_t
  115. {
  116. osd_reply_header_t header;
  117. };
  118. typedef osd_reply_sec_stab_t osd_reply_sec_rollback_t;
  119. // bulk read bitmaps from a secondary OSD
  120. struct __attribute__((__packed__)) osd_op_sec_read_bmp_t
  121. {
  122. osd_op_header_t header;
  123. // obj_ver_id array length in bytes
  124. uint64_t len;
  125. };
  126. struct __attribute__((__packed__)) osd_reply_sec_read_bmp_t
  127. {
  128. // retval is payload length in bytes. payload is {version,bitmap}[]
  129. osd_reply_header_t header;
  130. };
  131. // show configuration
  132. struct __attribute__((__packed__)) osd_op_show_config_t
  133. {
  134. osd_op_header_t header;
  135. // JSON request length
  136. uint64_t json_len;
  137. };
  138. struct __attribute__((__packed__)) osd_reply_show_config_t
  139. {
  140. osd_reply_header_t header;
  141. };
  142. // list objects on replica
  143. struct __attribute__((__packed__)) osd_op_sec_list_t
  144. {
  145. osd_op_header_t header;
  146. // placement group total number and total count
  147. pg_num_t list_pg, pg_count;
  148. // size of an area that maps to one PG continuously
  149. uint64_t pg_stripe_size;
  150. // inode range (used to select pools)
  151. uint64_t min_inode, max_inode;
  152. };
  153. struct __attribute__((__packed__)) osd_reply_sec_list_t
  154. {
  155. osd_reply_header_t header;
  156. // stable object version count. header.retval = total object version count
  157. // FIXME: maybe change to the number of bytes in the reply...
  158. uint64_t stable_count;
  159. };
  160. // read or write to the primary OSD (must be within individual stripe)
  161. struct __attribute__((__packed__)) osd_op_rw_t
  162. {
  163. osd_op_header_t header;
  164. // inode
  165. uint64_t inode;
  166. // offset
  167. uint64_t offset;
  168. // length
  169. uint32_t len;
  170. // flags (for future)
  171. uint32_t flags;
  172. // inode metadata revision
  173. uint64_t meta_revision;
  174. };
  175. struct __attribute__((__packed__)) osd_reply_rw_t
  176. {
  177. osd_reply_header_t header;
  178. // for reads: bitmap length
  179. uint32_t bitmap_len;
  180. uint32_t pad0;
  181. };
  182. // sync to the primary OSD
  183. struct __attribute__((__packed__)) osd_op_sync_t
  184. {
  185. osd_op_header_t header;
  186. };
  187. struct __attribute__((__packed__)) osd_reply_sync_t
  188. {
  189. osd_reply_header_t header;
  190. };
  191. // FIXME it would be interesting to try to unify blockstore_op and osd_op formats
  192. union osd_any_op_t
  193. {
  194. osd_op_header_t hdr;
  195. osd_op_sec_rw_t sec_rw;
  196. osd_op_sec_del_t sec_del;
  197. osd_op_sec_sync_t sec_sync;
  198. osd_op_sec_stab_t sec_stab;
  199. osd_op_sec_read_bmp_t sec_read_bmp;
  200. osd_op_sec_list_t sec_list;
  201. osd_op_show_config_t show_conf;
  202. osd_op_rw_t rw;
  203. osd_op_sync_t sync;
  204. uint8_t buf[OSD_PACKET_SIZE];
  205. };
  206. union osd_any_reply_t
  207. {
  208. osd_reply_header_t hdr;
  209. osd_reply_sec_rw_t sec_rw;
  210. osd_reply_sec_del_t sec_del;
  211. osd_reply_sec_sync_t sec_sync;
  212. osd_reply_sec_stab_t sec_stab;
  213. osd_reply_sec_read_bmp_t sec_read_bmp;
  214. osd_reply_sec_list_t sec_list;
  215. osd_reply_show_config_t show_conf;
  216. osd_reply_rw_t rw;
  217. osd_reply_sync_t sync;
  218. uint8_t buf[OSD_PACKET_SIZE];
  219. };
  // Table of operation name strings; the definition lives in another translation unit.
  // NOTE(review): presumably indexed by opcode - confirm against the definition.
  extern const char *osd_op_names[];