Simplified distributed block storage with strong consistency, like in Ceph
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

321 lines
11 KiB

  1. #include "blockstore_impl.h"
  // Returns log2(value) when <value> is an exact power of two, and 64 otherwise.
  // 64 can never be the base-2 logarithm of a 64-bit value, so it serves as the
  // "not a power of two" marker. Zero is not a power of two and also yields 64.
  static uint32_t is_power_of_two(uint64_t value)
  {
      // A power of two has exactly one bit set, so clearing the lowest set bit leaves zero
      if (value == 0 || (value & (value - 1)) != 0)
      {
          return 64;
      }
      uint32_t order = 0;
      while (value > 1)
      {
          value >>= 1;
          order++;
      }
      return order;
  }
  16. void blockstore_impl_t::parse_config(blockstore_config_t & config)
  17. {
  18. // Parse
  19. if (config["readonly"] == "true" || config["readonly"] == "1" || config["readonly"] == "yes")
  20. {
  21. readonly = true;
  22. }
  23. if (config["disable_data_fsync"] == "true" || config["disable_data_fsync"] == "1" || config["disable_data_fsync"] == "yes")
  24. {
  25. disable_data_fsync = true;
  26. }
  27. if (config["disable_meta_fsync"] == "true" || config["disable_meta_fsync"] == "1" || config["disable_meta_fsync"] == "yes")
  28. {
  29. disable_meta_fsync = true;
  30. }
  31. if (config["disable_journal_fsync"] == "true" || config["disable_journal_fsync"] == "1" || config["disable_journal_fsync"] == "yes")
  32. {
  33. disable_journal_fsync = true;
  34. }
  35. metadata_buf_size = strtoull(config["meta_buf_size"].c_str(), NULL, 10);
  36. cfg_journal_size = strtoull(config["journal_size"].c_str(), NULL, 10);
  37. data_device = config["data_device"];
  38. data_offset = strtoull(config["data_offset"].c_str(), NULL, 10);
  39. meta_device = config["meta_device"];
  40. meta_offset = strtoull(config["meta_offset"].c_str(), NULL, 10);
  41. block_size = strtoull(config["block_size"].c_str(), NULL, 10);
  42. inmemory_meta = config["inmemory_metadata"] != "false";
  43. journal_device = config["journal_device"];
  44. journal.offset = strtoull(config["journal_offset"].c_str(), NULL, 10);
  45. journal.sector_count = strtoull(config["journal_sector_buffer_count"].c_str(), NULL, 10);
  46. journal.inmemory = config["inmemory_journal"] != "false";
  47. disk_alignment = strtoull(config["disk_alignment"].c_str(), NULL, 10);
  48. journal_block_size = strtoull(config["journal_block_size"].c_str(), NULL, 10);
  49. meta_block_size = strtoull(config["meta_block_size"].c_str(), NULL, 10);
  50. bitmap_granularity = strtoull(config["bitmap_granularity"].c_str(), NULL, 10);
  51. flusher_count = strtoull(config["flusher_count"].c_str(), NULL, 10);
  52. // Validate
  53. if (!block_size)
  54. {
  55. block_size = (1 << DEFAULT_ORDER);
  56. }
  57. if ((block_order = is_power_of_two(block_size)) >= 64 || block_size < MIN_BLOCK_SIZE || block_size >= MAX_BLOCK_SIZE)
  58. {
  59. throw std::runtime_error("Bad block size");
  60. }
  61. if (!flusher_count)
  62. {
  63. flusher_count = 32;
  64. }
  65. if (!disk_alignment)
  66. {
  67. disk_alignment = 512;
  68. }
  69. else if (disk_alignment % MEM_ALIGNMENT)
  70. {
  71. throw std::runtime_error("disk_alingment must be a multiple of "+std::to_string(MEM_ALIGNMENT));
  72. }
  73. if (!journal_block_size)
  74. {
  75. journal_block_size = 512;
  76. }
  77. else if (journal_block_size % MEM_ALIGNMENT)
  78. {
  79. throw std::runtime_error("journal_block_size must be a multiple of "+std::to_string(MEM_ALIGNMENT));
  80. }
  81. if (!meta_block_size)
  82. {
  83. meta_block_size = 512;
  84. }
  85. else if (meta_block_size % MEM_ALIGNMENT)
  86. {
  87. throw std::runtime_error("meta_block_size must be a multiple of "+std::to_string(MEM_ALIGNMENT));
  88. }
  89. if (data_offset % disk_alignment)
  90. {
  91. throw std::runtime_error("data_offset must be a multiple of disk_alignment = "+std::to_string(disk_alignment));
  92. }
  93. if (!bitmap_granularity)
  94. {
  95. bitmap_granularity = 4096;
  96. }
  97. else if (bitmap_granularity % disk_alignment)
  98. {
  99. throw std::runtime_error("Sparse write tracking granularity must be a multiple of disk_alignment = "+std::to_string(disk_alignment));
  100. }
  101. if (block_size % bitmap_granularity)
  102. {
  103. throw std::runtime_error("Block size must be a multiple of sparse write tracking granularity");
  104. }
  105. if (journal_device == meta_device || meta_device == "" && journal_device == data_device)
  106. {
  107. journal_device = "";
  108. }
  109. if (meta_device == data_device)
  110. {
  111. meta_device = "";
  112. }
  113. if (meta_offset % meta_block_size)
  114. {
  115. throw std::runtime_error("meta_offset must be a multiple of meta_block_size = "+std::to_string(meta_block_size));
  116. }
  117. if (journal.offset % journal_block_size)
  118. {
  119. throw std::runtime_error("journal_offset must be a multiple of journal_block_size = "+std::to_string(journal_block_size));
  120. }
  121. if (journal.sector_count < 2)
  122. {
  123. journal.sector_count = 32;
  124. }
  125. if (metadata_buf_size < 65536)
  126. {
  127. metadata_buf_size = 4*1024*1024;
  128. }
  129. // init some fields
  130. clean_entry_bitmap_size = block_size / bitmap_granularity / 8;
  131. clean_entry_size = sizeof(clean_disk_entry) + clean_entry_bitmap_size;
  132. journal.block_size = journal_block_size;
  133. journal.next_free = journal_block_size;
  134. journal.used_start = journal_block_size;
  135. // no free space because sector is initially unmapped
  136. journal.in_sector_pos = journal_block_size;
  137. }
  138. void blockstore_impl_t::calc_lengths()
  139. {
  140. // register fds
  141. data_fd_index = ringloop->register_fd(data_fd);
  142. meta_fd_index = meta_fd == data_fd ? data_fd_index : ringloop->register_fd(meta_fd);
  143. journal.fd_index = journal_fd_index = journal.fd == meta_fd ? meta_fd_index : ringloop->register_fd(journal.fd);
  144. // data
  145. data_len = data_size - data_offset;
  146. if (data_fd == meta_fd && data_offset < meta_offset)
  147. {
  148. data_len = meta_offset - data_offset;
  149. }
  150. if (data_fd == journal.fd && data_offset < journal.offset)
  151. {
  152. data_len = data_len < journal.offset-data_offset
  153. ? data_len : journal.offset-data_offset;
  154. }
  155. // meta
  156. meta_area = (meta_fd == data_fd ? data_size : meta_size) - meta_offset;
  157. if (meta_fd == data_fd && meta_offset <= data_offset)
  158. {
  159. meta_area = data_offset - meta_offset;
  160. }
  161. if (meta_fd == journal.fd && meta_offset <= journal.offset)
  162. {
  163. meta_area = meta_area < journal.offset-meta_offset
  164. ? meta_area : journal.offset-meta_offset;
  165. }
  166. // journal
  167. journal.len = (journal.fd == data_fd ? data_size : (journal.fd == meta_fd ? meta_size : journal.device_size)) - journal.offset;
  168. if (journal.fd == data_fd && journal.offset <= data_offset)
  169. {
  170. journal.len = data_offset - journal.offset;
  171. }
  172. if (journal.fd == meta_fd && journal.offset <= meta_offset)
  173. {
  174. journal.len = journal.len < meta_offset-journal.offset
  175. ? journal.len : meta_offset-journal.offset;
  176. }
  177. // required metadata size
  178. block_count = data_len / block_size;
  179. meta_len = ((block_count - 1 + meta_block_size / clean_entry_size) / (meta_block_size / clean_entry_size)) * meta_block_size;
  180. if (meta_area < meta_len)
  181. {
  182. throw std::runtime_error("Metadata area is too small, need at least "+std::to_string(meta_len)+" bytes");
  183. }
  184. if (inmemory_meta)
  185. {
  186. metadata_buffer = memalign(MEM_ALIGNMENT, meta_len);
  187. if (!metadata_buffer)
  188. throw std::runtime_error("Failed to allocate memory for the metadata");
  189. }
  190. else if (clean_entry_bitmap_size)
  191. {
  192. clean_bitmap = (uint8_t*)malloc(block_count * clean_entry_bitmap_size);
  193. if (!clean_bitmap)
  194. throw std::runtime_error("Failed to allocate memory for the metadata sparse write bitmap");
  195. }
  196. // requested journal size
  197. if (cfg_journal_size > journal.len)
  198. {
  199. throw std::runtime_error("Requested journal_size is too large");
  200. }
  201. else if (cfg_journal_size > 0)
  202. {
  203. journal.len = cfg_journal_size;
  204. }
  205. if (journal.len < MIN_JOURNAL_SIZE)
  206. {
  207. throw std::runtime_error("Journal is too small, need at least "+std::to_string(MIN_JOURNAL_SIZE)+" bytes");
  208. }
  209. if (journal.inmemory)
  210. {
  211. journal.buffer = memalign(MEM_ALIGNMENT, journal.len);
  212. if (!journal.buffer)
  213. throw std::runtime_error("Failed to allocate memory for journal");
  214. }
  215. }
  216. void check_size(int fd, uint64_t *size, std::string name)
  217. {
  218. int sectsize;
  219. struct stat st;
  220. if (fstat(fd, &st) < 0)
  221. {
  222. throw std::runtime_error("Failed to stat "+name);
  223. }
  224. if (S_ISREG(st.st_mode))
  225. {
  226. *size = st.st_size;
  227. }
  228. else if (S_ISBLK(st.st_mode))
  229. {
  230. if (ioctl(fd, BLKSSZGET, &sectsize) < 0 ||
  231. ioctl(fd, BLKGETSIZE64, size) < 0 ||
  232. sectsize != 512)
  233. {
  234. throw std::runtime_error(name+" sector is not equal to 512 bytes");
  235. }
  236. }
  237. else
  238. {
  239. throw std::runtime_error(name+" is neither a file nor a block device");
  240. }
  241. }
  242. void blockstore_impl_t::open_data()
  243. {
  244. data_fd = open(data_device.c_str(), O_DIRECT|O_RDWR);
  245. if (data_fd == -1)
  246. {
  247. throw std::runtime_error("Failed to open data device");
  248. }
  249. check_size(data_fd, &data_size, "data device");
  250. if (data_offset >= data_size)
  251. {
  252. throw std::runtime_error("data_offset exceeds device size = "+std::to_string(data_size));
  253. }
  254. }
  255. void blockstore_impl_t::open_meta()
  256. {
  257. if (meta_device != "")
  258. {
  259. meta_offset = 0;
  260. meta_fd = open(meta_device.c_str(), O_DIRECT|O_RDWR);
  261. if (meta_fd == -1)
  262. {
  263. throw std::runtime_error("Failed to open metadata device");
  264. }
  265. check_size(meta_fd, &meta_size, "metadata device");
  266. if (meta_offset >= meta_size)
  267. {
  268. throw std::runtime_error("meta_offset exceeds device size = "+std::to_string(meta_size));
  269. }
  270. }
  271. else
  272. {
  273. meta_fd = data_fd;
  274. disable_meta_fsync = disable_data_fsync;
  275. meta_size = 0;
  276. if (meta_offset >= data_size)
  277. {
  278. throw std::runtime_error("meta_offset exceeds device size = "+std::to_string(data_size));
  279. }
  280. }
  281. }
  282. void blockstore_impl_t::open_journal()
  283. {
  284. if (journal_device != "")
  285. {
  286. journal.fd = open(journal_device.c_str(), O_DIRECT|O_RDWR);
  287. if (journal.fd == -1)
  288. {
  289. throw std::runtime_error("Failed to open journal device");
  290. }
  291. check_size(journal.fd, &journal.device_size, "metadata device");
  292. }
  293. else
  294. {
  295. journal.fd = meta_fd;
  296. disable_journal_fsync = disable_meta_fsync;
  297. journal.device_size = 0;
  298. if (journal.offset >= data_size)
  299. {
  300. throw std::runtime_error("journal_offset exceeds device size");
  301. }
  302. }
  303. journal.sector_info = (journal_sector_info_t*)calloc(journal.sector_count, sizeof(journal_sector_info_t));
  304. if (!journal.sector_info)
  305. {
  306. throw std::bad_alloc();
  307. }
  308. if (!journal.inmemory)
  309. {
  310. journal.sector_buf = (uint8_t*)memalign(MEM_ALIGNMENT, journal.sector_count * journal_block_size);
  311. if (!journal.sector_buf)
  312. throw std::bad_alloc();
  313. }
  314. }