Simplified distributed block storage with strong consistency, like in Ceph
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 
 
 

225 lines
7.4 KiB

  1. // Copyright (c) Vitaliy Filippov, 2019+
  2. // License: VNPL-1.0 (see README.md for details)
  3. #define _LARGEFILE64_SOURCE
  4. #include <sys/types.h>
  5. #include <sys/ioctl.h>
  6. #include <sys/stat.h>
  7. #include <sys/time.h>
  8. #include <fcntl.h>
  9. #include <unistd.h>
  10. #include <stdint.h>
  11. #include <malloc.h>
  12. #include <linux/fs.h>
  13. #include <string.h>
  14. #include <errno.h>
  15. #include <assert.h>
  16. #include <stdio.h>
  17. #include "blockstore_impl.h"
  18. #include "crc32c.h"
  // State of one journal dump run: command-line parameters, the open journal
  // descriptor and the cursor/CRC bookkeeping shared between main() and dump_block().
  struct journal_dump_t
  {
      // Path of the journal file/device, taken from the command line
      char *journal_device;
      // Size of one journal block in bytes; every block read uses this length
      uint32_t journal_block;
      // Byte offset added to every read position inside <journal_device>
      uint64_t journal_offset;
      // Length of the journal area in bytes (upper bound for <journal_pos>)
      uint64_t journal_len;
      // Current position inside the journal, relative to <journal_offset>
      uint64_t journal_pos;

      // true when "--all" was given: blocks are scanned sequentially and
      // entries are printed even if their checksums do not match
      bool all;
      // true once at least one entry has been accepted, i.e. <crc32_last> is meaningful
      bool started;

      // File descriptor of the opened journal
      int fd;
      // crc32 field of the most recently accepted entry, compared with crc32_prev of the next one
      uint32_t crc32_last;

      // Prints the entries found in one journal block held in <buf>;
      // returns the number of entries printed
      int dump_block(void *buf);
  };
  32. int main(int argc, char *argv[])
  33. {
  34. journal_dump_t self = { 0 };
  35. int b = 1;
  36. if (argc >= 2 && !strcmp(argv[1], "--all"))
  37. {
  38. self.all = true;
  39. b = 2;
  40. }
  41. if (argc < b+4)
  42. {
  43. printf("USAGE: %s [--all] <journal_file> <journal_block_size> <offset> <size>\n", argv[0]);
  44. return 1;
  45. }
  46. self.journal_device = argv[b];
  47. self.journal_block = strtoul(argv[b+1], NULL, 10);
  48. self.journal_offset = strtoull(argv[b+2], NULL, 10);
  49. self.journal_len = strtoull(argv[b+3], NULL, 10);
  50. if (self.journal_block < MEM_ALIGNMENT || (self.journal_block % MEM_ALIGNMENT) ||
  51. self.journal_block > 128*1024)
  52. {
  53. printf("Invalid journal block size\n");
  54. return 1;
  55. }
  56. self.fd = open(self.journal_device, O_DIRECT|O_RDONLY);
  57. if (self.fd == -1)
  58. {
  59. printf("Failed to open journal\n");
  60. return 1;
  61. }
  62. void *data = memalign(MEM_ALIGNMENT, self.journal_block);
  63. self.journal_pos = 0;
  64. if (self.all)
  65. {
  66. while (self.journal_pos < self.journal_len)
  67. {
  68. int r = pread(self.fd, data, self.journal_block, self.journal_offset+self.journal_pos);
  69. assert(r == self.journal_block);
  70. uint64_t s;
  71. for (s = 0; s < self.journal_block; s += 8)
  72. {
  73. if (*((uint64_t*)(data+s)) != 0)
  74. break;
  75. }
  76. if (s == self.journal_block)
  77. {
  78. printf("offset %08lx: zeroes\n", self.journal_pos);
  79. self.journal_pos += self.journal_block;
  80. }
  81. else if (((journal_entry*)data)->magic == JOURNAL_MAGIC)
  82. {
  83. printf("offset %08lx:\n", self.journal_pos);
  84. self.dump_block(data);
  85. }
  86. else
  87. {
  88. printf("offset %08lx: no magic in the beginning, looks like random data (pattern=%lx)\n", self.journal_pos, *((uint64_t*)data));
  89. self.journal_pos += self.journal_block;
  90. }
  91. }
  92. }
  93. else
  94. {
  95. int r = pread(self.fd, data, self.journal_block, self.journal_offset+self.journal_pos);
  96. assert(r == self.journal_block);
  97. journal_entry *je = (journal_entry*)(data);
  98. if (je->magic != JOURNAL_MAGIC || je->type != JE_START || je_crc32(je) != je->crc32)
  99. {
  100. printf("offset %08lx: journal superblock is invalid\n", self.journal_pos);
  101. }
  102. else
  103. {
  104. printf("offset %08lx:\n", self.journal_pos);
  105. self.dump_block(data);
  106. self.started = false;
  107. self.journal_pos = je->start.journal_start;
  108. while (1)
  109. {
  110. if (self.journal_pos >= self.journal_len)
  111. self.journal_pos = self.journal_block;
  112. r = pread(self.fd, data, self.journal_block, self.journal_offset+self.journal_pos);
  113. assert(r == self.journal_block);
  114. printf("offset %08lx:\n", self.journal_pos);
  115. r = self.dump_block(data);
  116. if (r <= 0)
  117. {
  118. printf("end of the journal\n");
  119. break;
  120. }
  121. }
  122. }
  123. }
  124. free(data);
  125. close(self.fd);
  126. return 0;
  127. }
  128. int journal_dump_t::dump_block(void *buf)
  129. {
  130. uint32_t pos = 0;
  131. journal_pos += journal_block;
  132. int entry = 0;
  133. bool wrapped = false;
  134. while (pos < journal_block)
  135. {
  136. journal_entry *je = (journal_entry*)(buf + pos);
  137. if (je->magic != JOURNAL_MAGIC || je->type < JE_MIN || je->type > JE_MAX ||
  138. !all && started && je->crc32_prev != crc32_last)
  139. {
  140. break;
  141. }
  142. bool crc32_valid = je_crc32(je) == je->crc32;
  143. if (!all && !crc32_valid)
  144. {
  145. break;
  146. }
  147. started = true;
  148. crc32_last = je->crc32;
  149. printf("entry % 3d: crc32=%08x %s prev=%08x ", entry, je->crc32, (crc32_valid ? "(valid)" : "(invalid)"), je->crc32_prev);
  150. if (je->type == JE_START)
  151. {
  152. printf("je_start start=%08lx\n", je->start.journal_start);
  153. }
  154. else if (je->type == JE_SMALL_WRITE || je->type == JE_SMALL_WRITE_INSTANT)
  155. {
  156. printf(
  157. "je_small_write%s oid=%lx:%lx ver=%lu offset=%u len=%u loc=%08lx",
  158. je->type == JE_SMALL_WRITE_INSTANT ? "_instant" : "",
  159. je->small_write.oid.inode, je->small_write.oid.stripe,
  160. je->small_write.version, je->small_write.offset, je->small_write.len,
  161. je->small_write.data_offset
  162. );
  163. if (journal_pos + je->small_write.len > journal_len)
  164. {
  165. // data continues from the beginning of the journal
  166. journal_pos = journal_block;
  167. wrapped = true;
  168. }
  169. if (journal_pos != je->small_write.data_offset)
  170. {
  171. printf(" (mismatched, calculated = %lu)", journal_pos);
  172. }
  173. journal_pos += je->small_write.len;
  174. if (journal_pos >= journal_len)
  175. {
  176. journal_pos = journal_block;
  177. wrapped = true;
  178. }
  179. uint32_t data_crc32 = 0;
  180. void *data = memalign(MEM_ALIGNMENT, je->small_write.len);
  181. assert(pread(fd, data, je->small_write.len, journal_offset+je->small_write.data_offset) == je->small_write.len);
  182. data_crc32 = crc32c(0, data, je->small_write.len);
  183. free(data);
  184. printf(
  185. " data_crc32=%08x%s", je->small_write.crc32_data,
  186. (data_crc32 != je->small_write.crc32_data) ? " (invalid)" : " (valid)"
  187. );
  188. printf("\n");
  189. }
  190. else if (je->type == JE_BIG_WRITE || je->type == JE_BIG_WRITE_INSTANT)
  191. {
  192. printf(
  193. "je_big_write%s oid=%lx:%lx ver=%lu loc=%08lx\n",
  194. je->type == JE_BIG_WRITE_INSTANT ? "_instant" : "",
  195. je->big_write.oid.inode, je->big_write.oid.stripe, je->big_write.version, je->big_write.location
  196. );
  197. }
  198. else if (je->type == JE_STABLE)
  199. {
  200. printf("je_stable oid=%lx:%lx ver=%lu\n", je->stable.oid.inode, je->stable.oid.stripe, je->stable.version);
  201. }
  202. else if (je->type == JE_ROLLBACK)
  203. {
  204. printf("je_rollback oid=%lx:%lx ver=%lu\n", je->rollback.oid.inode, je->rollback.oid.stripe, je->rollback.version);
  205. }
  206. else if (je->type == JE_DELETE)
  207. {
  208. printf("je_delete oid=%lx:%lx ver=%lu\n", je->del.oid.inode, je->del.oid.stripe, je->del.version);
  209. }
  210. pos += je->size;
  211. entry++;
  212. }
  213. if (wrapped)
  214. {
  215. journal_pos = journal_len;
  216. }
  217. return entry;
  218. }