Simplified distributed block storage with strong consistency, like in Ceph
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 
 
 
 

410 lines
14 KiB

  1. // Copyright (c) Vitaliy Filippov, 2019+
  2. // License: VNPL-1.1 (see README.md for details)
  3. #include <sys/file.h>
  4. #include "blockstore_impl.h"
  // Returns the exponent (log2) of <value> when it is an exact power of two,
  // and 64 otherwise. 64 serves as the "not a power of two" marker because it
  // can never be a valid exponent of a 64-bit number.
  // Zero is not a power of two, so it also yields 64.
  static uint32_t is_power_of_two(uint64_t value)
  {
      // A power of two has exactly one bit set, so clearing its lowest set bit must leave 0
      if (value == 0 || (value & (value - 1)) != 0)
      {
          return 64;
      }
      uint32_t order = 0;
      while (value > 1)
      {
          value >>= 1;
          order++;
      }
      return order;
  }
  19. void blockstore_impl_t::parse_config(blockstore_config_t & config)
  20. {
  21. // Parse
  22. if (config["readonly"] == "true" || config["readonly"] == "1" || config["readonly"] == "yes")
  23. {
  24. readonly = true;
  25. }
  26. if (config["disable_data_fsync"] == "true" || config["disable_data_fsync"] == "1" || config["disable_data_fsync"] == "yes")
  27. {
  28. disable_data_fsync = true;
  29. }
  30. if (config["disable_meta_fsync"] == "true" || config["disable_meta_fsync"] == "1" || config["disable_meta_fsync"] == "yes")
  31. {
  32. disable_meta_fsync = true;
  33. }
  34. if (config["disable_journal_fsync"] == "true" || config["disable_journal_fsync"] == "1" || config["disable_journal_fsync"] == "yes")
  35. {
  36. disable_journal_fsync = true;
  37. }
  38. if (config["disable_device_lock"] == "true" || config["disable_device_lock"] == "1" || config["disable_device_lock"] == "yes")
  39. {
  40. disable_flock = true;
  41. }
  42. if (config["flush_journal"] == "true" || config["flush_journal"] == "1" || config["flush_journal"] == "yes")
  43. {
  44. // Only flush journal and exit
  45. journal.flush_journal = true;
  46. }
  47. if (config["immediate_commit"] == "all")
  48. {
  49. immediate_commit = IMMEDIATE_ALL;
  50. }
  51. else if (config["immediate_commit"] == "small")
  52. {
  53. immediate_commit = IMMEDIATE_SMALL;
  54. }
  55. metadata_buf_size = strtoull(config["meta_buf_size"].c_str(), NULL, 10);
  56. cfg_journal_size = strtoull(config["journal_size"].c_str(), NULL, 10);
  57. data_device = config["data_device"];
  58. data_offset = strtoull(config["data_offset"].c_str(), NULL, 10);
  59. cfg_data_size = strtoull(config["data_size"].c_str(), NULL, 10);
  60. meta_device = config["meta_device"];
  61. meta_offset = strtoull(config["meta_offset"].c_str(), NULL, 10);
  62. block_size = strtoull(config["block_size"].c_str(), NULL, 10);
  63. inmemory_meta = config["inmemory_metadata"] != "false";
  64. journal_device = config["journal_device"];
  65. journal.offset = strtoull(config["journal_offset"].c_str(), NULL, 10);
  66. journal.sector_count = strtoull(config["journal_sector_buffer_count"].c_str(), NULL, 10);
  67. journal.no_same_sector_overwrites = config["journal_no_same_sector_overwrites"] == "true" ||
  68. config["journal_no_same_sector_overwrites"] == "1" || config["journal_no_same_sector_overwrites"] == "yes";
  69. journal.inmemory = config["inmemory_journal"] != "false";
  70. disk_alignment = strtoull(config["disk_alignment"].c_str(), NULL, 10);
  71. journal_block_size = strtoull(config["journal_block_size"].c_str(), NULL, 10);
  72. meta_block_size = strtoull(config["meta_block_size"].c_str(), NULL, 10);
  73. bitmap_granularity = strtoull(config["bitmap_granularity"].c_str(), NULL, 10);
  74. max_flusher_count = strtoull(config["max_flusher_count"].c_str(), NULL, 10);
  75. if (!max_flusher_count)
  76. max_flusher_count = strtoull(config["flusher_count"].c_str(), NULL, 10);
  77. min_flusher_count = strtoull(config["min_flusher_count"].c_str(), NULL, 10);
  78. max_write_iodepth = strtoull(config["max_write_iodepth"].c_str(), NULL, 10);
  79. throttle_small_writes = config["throttle_small_writes"] == "true" || config["throttle_small_writes"] == "1" || config["throttle_small_writes"] == "yes";
  80. throttle_target_iops = strtoull(config["throttle_target_iops"].c_str(), NULL, 10);
  81. throttle_target_mbs = strtoull(config["throttle_target_mbs"].c_str(), NULL, 10);
  82. throttle_target_parallelism = strtoull(config["throttle_target_parallelism"].c_str(), NULL, 10);
  83. throttle_threshold_us = strtoull(config["throttle_threshold_us"].c_str(), NULL, 10);
  84. // Validate
  85. if (!block_size)
  86. {
  87. block_size = (1 << DEFAULT_ORDER);
  88. }
  89. if ((block_order = is_power_of_two(block_size)) >= 64 || block_size < MIN_BLOCK_SIZE || block_size >= MAX_BLOCK_SIZE)
  90. {
  91. throw std::runtime_error("Bad block size");
  92. }
  93. if (!max_flusher_count)
  94. {
  95. max_flusher_count = 256;
  96. }
  97. if (!min_flusher_count || journal.flush_journal)
  98. {
  99. min_flusher_count = 1;
  100. }
  101. if (!max_write_iodepth)
  102. {
  103. max_write_iodepth = 128;
  104. }
  105. if (!disk_alignment)
  106. {
  107. disk_alignment = 4096;
  108. }
  109. else if (disk_alignment % MEM_ALIGNMENT)
  110. {
  111. throw std::runtime_error("disk_alignment must be a multiple of "+std::to_string(MEM_ALIGNMENT));
  112. }
  113. if (!journal_block_size)
  114. {
  115. journal_block_size = 4096;
  116. }
  117. else if (journal_block_size % MEM_ALIGNMENT)
  118. {
  119. throw std::runtime_error("journal_block_size must be a multiple of "+std::to_string(MEM_ALIGNMENT));
  120. }
  121. if (!meta_block_size)
  122. {
  123. meta_block_size = 4096;
  124. }
  125. else if (meta_block_size % MEM_ALIGNMENT)
  126. {
  127. throw std::runtime_error("meta_block_size must be a multiple of "+std::to_string(MEM_ALIGNMENT));
  128. }
  129. if (data_offset % disk_alignment)
  130. {
  131. throw std::runtime_error("data_offset must be a multiple of disk_alignment = "+std::to_string(disk_alignment));
  132. }
  133. if (!bitmap_granularity)
  134. {
  135. bitmap_granularity = DEFAULT_BITMAP_GRANULARITY;
  136. }
  137. else if (bitmap_granularity % disk_alignment)
  138. {
  139. throw std::runtime_error("Sparse write tracking granularity must be a multiple of disk_alignment = "+std::to_string(disk_alignment));
  140. }
  141. if (block_size % bitmap_granularity)
  142. {
  143. throw std::runtime_error("Block size must be a multiple of sparse write tracking granularity");
  144. }
  145. if (journal_device == meta_device || meta_device == "" && journal_device == data_device)
  146. {
  147. journal_device = "";
  148. }
  149. if (meta_device == data_device)
  150. {
  151. meta_device = "";
  152. }
  153. if (meta_offset % meta_block_size)
  154. {
  155. throw std::runtime_error("meta_offset must be a multiple of meta_block_size = "+std::to_string(meta_block_size));
  156. }
  157. if (journal.offset % journal_block_size)
  158. {
  159. throw std::runtime_error("journal_offset must be a multiple of journal_block_size = "+std::to_string(journal_block_size));
  160. }
  161. if (journal.sector_count < 2)
  162. {
  163. journal.sector_count = 32;
  164. }
  165. if (metadata_buf_size < 65536)
  166. {
  167. metadata_buf_size = 4*1024*1024;
  168. }
  169. if (meta_device == "")
  170. {
  171. disable_meta_fsync = disable_data_fsync;
  172. }
  173. if (journal_device == "")
  174. {
  175. disable_journal_fsync = disable_meta_fsync;
  176. }
  177. if (immediate_commit != IMMEDIATE_NONE && !disable_journal_fsync)
  178. {
  179. throw std::runtime_error("immediate_commit requires disable_journal_fsync");
  180. }
  181. if (immediate_commit == IMMEDIATE_ALL && !disable_data_fsync)
  182. {
  183. throw std::runtime_error("immediate_commit=all requires disable_journal_fsync and disable_data_fsync");
  184. }
  185. if (!throttle_target_iops)
  186. {
  187. throttle_target_iops = 100;
  188. }
  189. if (!throttle_target_mbs)
  190. {
  191. throttle_target_mbs = 100;
  192. }
  193. if (!throttle_target_parallelism)
  194. {
  195. throttle_target_parallelism = 1;
  196. }
  197. if (!throttle_threshold_us)
  198. {
  199. throttle_threshold_us = 50;
  200. }
  201. // init some fields
  202. clean_entry_bitmap_size = block_size / bitmap_granularity / 8;
  203. clean_entry_size = sizeof(clean_disk_entry) + 2*clean_entry_bitmap_size;
  204. journal.block_size = journal_block_size;
  205. journal.next_free = journal_block_size;
  206. journal.used_start = journal_block_size;
  207. // no free space because sector is initially unmapped
  208. journal.in_sector_pos = journal_block_size;
  209. }
  210. void blockstore_impl_t::calc_lengths()
  211. {
  212. // data
  213. data_len = data_size - data_offset;
  214. if (data_fd == meta_fd && data_offset < meta_offset)
  215. {
  216. data_len = meta_offset - data_offset;
  217. }
  218. if (data_fd == journal.fd && data_offset < journal.offset)
  219. {
  220. data_len = data_len < journal.offset-data_offset
  221. ? data_len : journal.offset-data_offset;
  222. }
  223. if (cfg_data_size != 0)
  224. {
  225. if (data_len < cfg_data_size)
  226. {
  227. throw std::runtime_error("Data area ("+std::to_string(data_len)+
  228. " bytes) is less than configured size ("+std::to_string(cfg_data_size)+" bytes)");
  229. }
  230. data_len = cfg_data_size;
  231. }
  232. // meta
  233. meta_area = (meta_fd == data_fd ? data_size : meta_size) - meta_offset;
  234. if (meta_fd == data_fd && meta_offset <= data_offset)
  235. {
  236. meta_area = data_offset - meta_offset;
  237. }
  238. if (meta_fd == journal.fd && meta_offset <= journal.offset)
  239. {
  240. meta_area = meta_area < journal.offset-meta_offset
  241. ? meta_area : journal.offset-meta_offset;
  242. }
  243. // journal
  244. journal.len = (journal.fd == data_fd ? data_size : (journal.fd == meta_fd ? meta_size : journal.device_size)) - journal.offset;
  245. if (journal.fd == data_fd && journal.offset <= data_offset)
  246. {
  247. journal.len = data_offset - journal.offset;
  248. }
  249. if (journal.fd == meta_fd && journal.offset <= meta_offset)
  250. {
  251. journal.len = journal.len < meta_offset-journal.offset
  252. ? journal.len : meta_offset-journal.offset;
  253. }
  254. // required metadata size
  255. block_count = data_len / block_size;
  256. meta_len = (1 + (block_count - 1 + meta_block_size / clean_entry_size) / (meta_block_size / clean_entry_size)) * meta_block_size;
  257. if (meta_area < meta_len)
  258. {
  259. throw std::runtime_error("Metadata area is too small, need at least "+std::to_string(meta_len)+" bytes");
  260. }
  261. if (inmemory_meta)
  262. {
  263. metadata_buffer = memalign(MEM_ALIGNMENT, meta_len);
  264. if (!metadata_buffer)
  265. throw std::runtime_error("Failed to allocate memory for the metadata");
  266. }
  267. else if (clean_entry_bitmap_size)
  268. {
  269. clean_bitmap = (uint8_t*)malloc(block_count * 2*clean_entry_bitmap_size);
  270. if (!clean_bitmap)
  271. throw std::runtime_error("Failed to allocate memory for the metadata sparse write bitmap");
  272. }
  273. // requested journal size
  274. if (cfg_journal_size > journal.len)
  275. {
  276. throw std::runtime_error("Requested journal_size is too large");
  277. }
  278. else if (cfg_journal_size > 0)
  279. {
  280. journal.len = cfg_journal_size;
  281. }
  282. if (journal.len < MIN_JOURNAL_SIZE)
  283. {
  284. throw std::runtime_error("Journal is too small, need at least "+std::to_string(MIN_JOURNAL_SIZE)+" bytes");
  285. }
  286. if (journal.inmemory)
  287. {
  288. journal.buffer = memalign(MEM_ALIGNMENT, journal.len);
  289. if (!journal.buffer)
  290. throw std::runtime_error("Failed to allocate memory for journal");
  291. }
  292. }
  // Stores the size in bytes of the regular file or block device behind <fd> into *size.
  // <name> is only used to build error messages.
  // For a block device the logical sector size must be 512 bytes.
  // Throws std::runtime_error if fstat() or an ioctl() fails, if the sector size
  // is not 512, or if <fd> is neither a regular file nor a block device.
  void check_size(int fd, uint64_t *size, std::string name)
  {
      struct stat st;
      if (fstat(fd, &st) < 0)
      {
          throw std::runtime_error("Failed to stat "+name);
      }
      if (S_ISREG(st.st_mode))
      {
          *size = st.st_size;
      }
      else if (S_ISBLK(st.st_mode))
      {
          int sectsize = 0;
          // Report ioctl failures as such instead of as a sector size mismatch
          if (ioctl(fd, BLKSSZGET, &sectsize) < 0)
          {
              throw std::runtime_error("Failed to get sector size of "+name+": "+strerror(errno));
          }
          if (ioctl(fd, BLKGETSIZE64, size) < 0)
          {
              throw std::runtime_error("Failed to get size of "+name+": "+strerror(errno));
          }
          if (sectsize != 512)
          {
              throw std::runtime_error(name+" sector is not equal to 512 bytes");
          }
      }
      else
      {
          throw std::runtime_error(name+" is neither a file nor a block device");
      }
  }
  319. void blockstore_impl_t::open_data()
  320. {
  321. data_fd = open(data_device.c_str(), O_DIRECT|O_RDWR);
  322. if (data_fd == -1)
  323. {
  324. throw std::runtime_error("Failed to open data device");
  325. }
  326. check_size(data_fd, &data_size, "data device");
  327. if (data_offset >= data_size)
  328. {
  329. throw std::runtime_error("data_offset exceeds device size = "+std::to_string(data_size));
  330. }
  331. if (!disable_flock && flock(data_fd, LOCK_EX|LOCK_NB) != 0)
  332. {
  333. throw std::runtime_error(std::string("Failed to lock data device: ") + strerror(errno));
  334. }
  335. }
  336. void blockstore_impl_t::open_meta()
  337. {
  338. if (meta_device != "")
  339. {
  340. meta_offset = 0;
  341. meta_fd = open(meta_device.c_str(), O_DIRECT|O_RDWR);
  342. if (meta_fd == -1)
  343. {
  344. throw std::runtime_error("Failed to open metadata device");
  345. }
  346. check_size(meta_fd, &meta_size, "metadata device");
  347. if (meta_offset >= meta_size)
  348. {
  349. throw std::runtime_error("meta_offset exceeds device size = "+std::to_string(meta_size));
  350. }
  351. if (!disable_flock && flock(meta_fd, LOCK_EX|LOCK_NB) != 0)
  352. {
  353. throw std::runtime_error(std::string("Failed to lock metadata device: ") + strerror(errno));
  354. }
  355. }
  356. else
  357. {
  358. meta_fd = data_fd;
  359. meta_size = 0;
  360. if (meta_offset >= data_size)
  361. {
  362. throw std::runtime_error("meta_offset exceeds device size = "+std::to_string(data_size));
  363. }
  364. }
  365. }
  366. void blockstore_impl_t::open_journal()
  367. {
  368. if (journal_device != "")
  369. {
  370. journal.fd = open(journal_device.c_str(), O_DIRECT|O_RDWR);
  371. if (journal.fd == -1)
  372. {
  373. throw std::runtime_error("Failed to open journal device");
  374. }
  375. check_size(journal.fd, &journal.device_size, "journal device");
  376. if (!disable_flock && flock(journal.fd, LOCK_EX|LOCK_NB) != 0)
  377. {
  378. throw std::runtime_error(std::string("Failed to lock journal device: ") + strerror(errno));
  379. }
  380. }
  381. else
  382. {
  383. journal.fd = meta_fd;
  384. journal.device_size = 0;
  385. if (journal.offset >= data_size)
  386. {
  387. throw std::runtime_error("journal_offset exceeds device size");
  388. }
  389. }
  390. journal.sector_info = (journal_sector_info_t*)calloc(journal.sector_count, sizeof(journal_sector_info_t));
  391. if (!journal.sector_info)
  392. {
  393. throw std::bad_alloc();
  394. }
  395. if (!journal.inmemory)
  396. {
  397. journal.sector_buf = (uint8_t*)memalign(MEM_ALIGNMENT, journal.sector_count * journal_block_size);
  398. if (!journal.sector_buf)
  399. throw std::bad_alloc();
  400. }
  401. }