Simplified distributed block storage with strong consistency, like in Ceph
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 
 
 

373 lines
12 KiB

  1. // Copyright (c) Vitaliy Filippov, 2019+
  2. // License: VNPL-1.0 (see README.md for details)
  3. #include <sys/file.h>
  4. #include "blockstore_impl.h"
  5. static uint32_t is_power_of_two(uint64_t value)
  6. {
  7. uint32_t l = 0;
  8. while (value > 1)
  9. {
  10. if (value & 1)
  11. {
  12. return 64;
  13. }
  14. value = value >> 1;
  15. l++;
  16. }
  17. return l;
  18. }
  19. void blockstore_impl_t::parse_config(blockstore_config_t & config)
  20. {
  21. // Parse
  22. if (config["readonly"] == "true" || config["readonly"] == "1" || config["readonly"] == "yes")
  23. {
  24. readonly = true;
  25. }
  26. if (config["disable_data_fsync"] == "true" || config["disable_data_fsync"] == "1" || config["disable_data_fsync"] == "yes")
  27. {
  28. disable_data_fsync = true;
  29. }
  30. if (config["disable_meta_fsync"] == "true" || config["disable_meta_fsync"] == "1" || config["disable_meta_fsync"] == "yes")
  31. {
  32. disable_meta_fsync = true;
  33. }
  34. if (config["disable_journal_fsync"] == "true" || config["disable_journal_fsync"] == "1" || config["disable_journal_fsync"] == "yes")
  35. {
  36. disable_journal_fsync = true;
  37. }
  38. if (config["disable_device_lock"] == "true" || config["disable_device_lock"] == "1" || config["disable_device_lock"] == "yes")
  39. {
  40. disable_flock = true;
  41. }
  42. if (config["immediate_commit"] == "all")
  43. {
  44. immediate_commit = IMMEDIATE_ALL;
  45. }
  46. else if (config["immediate_commit"] == "small")
  47. {
  48. immediate_commit = IMMEDIATE_SMALL;
  49. }
  50. metadata_buf_size = strtoull(config["meta_buf_size"].c_str(), NULL, 10);
  51. cfg_journal_size = strtoull(config["journal_size"].c_str(), NULL, 10);
  52. data_device = config["data_device"];
  53. data_offset = strtoull(config["data_offset"].c_str(), NULL, 10);
  54. cfg_data_size = strtoull(config["data_size"].c_str(), NULL, 10);
  55. meta_device = config["meta_device"];
  56. meta_offset = strtoull(config["meta_offset"].c_str(), NULL, 10);
  57. entry_attr_size = strtoull(config["entry_attr_size"].c_str(), NULL, 10);
  58. block_size = strtoull(config["block_size"].c_str(), NULL, 10);
  59. inmemory_meta = config["inmemory_metadata"] != "false";
  60. journal_device = config["journal_device"];
  61. journal.offset = strtoull(config["journal_offset"].c_str(), NULL, 10);
  62. journal.sector_count = strtoull(config["journal_sector_buffer_count"].c_str(), NULL, 10);
  63. journal.no_same_sector_overwrites = config["journal_no_same_sector_overwrites"] == "true" ||
  64. config["journal_no_same_sector_overwrites"] == "1" || config["journal_no_same_sector_overwrites"] == "yes";
  65. journal.inmemory = config["inmemory_journal"] != "false";
  66. disk_alignment = strtoull(config["disk_alignment"].c_str(), NULL, 10);
  67. journal_block_size = strtoull(config["journal_block_size"].c_str(), NULL, 10);
  68. meta_block_size = strtoull(config["meta_block_size"].c_str(), NULL, 10);
  69. bitmap_granularity = strtoull(config["bitmap_granularity"].c_str(), NULL, 10);
  70. flusher_count = strtoull(config["flusher_count"].c_str(), NULL, 10);
  71. // Validate
  72. if (!block_size)
  73. {
  74. block_size = (1 << DEFAULT_ORDER);
  75. }
  76. if ((block_order = is_power_of_two(block_size)) >= 64 || block_size < MIN_BLOCK_SIZE || block_size >= MAX_BLOCK_SIZE)
  77. {
  78. throw std::runtime_error("Bad block size");
  79. }
  80. if (!flusher_count)
  81. {
  82. flusher_count = 32;
  83. }
  84. if (!disk_alignment)
  85. {
  86. disk_alignment = 4096;
  87. }
  88. else if (disk_alignment % MEM_ALIGNMENT)
  89. {
  90. throw std::runtime_error("disk_alignment must be a multiple of "+std::to_string(MEM_ALIGNMENT));
  91. }
  92. if (!journal_block_size)
  93. {
  94. journal_block_size = 4096;
  95. }
  96. else if (journal_block_size % MEM_ALIGNMENT)
  97. {
  98. throw std::runtime_error("journal_block_size must be a multiple of "+std::to_string(MEM_ALIGNMENT));
  99. }
  100. if (!meta_block_size)
  101. {
  102. meta_block_size = 4096;
  103. }
  104. else if (meta_block_size % MEM_ALIGNMENT)
  105. {
  106. throw std::runtime_error("meta_block_size must be a multiple of "+std::to_string(MEM_ALIGNMENT));
  107. }
  108. if (data_offset % disk_alignment)
  109. {
  110. throw std::runtime_error("data_offset must be a multiple of disk_alignment = "+std::to_string(disk_alignment));
  111. }
  112. if (!bitmap_granularity)
  113. {
  114. bitmap_granularity = DEFAULT_BITMAP_GRANULARITY;
  115. }
  116. else if (bitmap_granularity % disk_alignment)
  117. {
  118. throw std::runtime_error("Sparse write tracking granularity must be a multiple of disk_alignment = "+std::to_string(disk_alignment));
  119. }
  120. if (block_size % bitmap_granularity)
  121. {
  122. throw std::runtime_error("Block size must be a multiple of sparse write tracking granularity");
  123. }
  124. if (journal_device == meta_device || meta_device == "" && journal_device == data_device)
  125. {
  126. journal_device = "";
  127. }
  128. if (meta_device == data_device)
  129. {
  130. meta_device = "";
  131. }
  132. if (meta_offset % meta_block_size)
  133. {
  134. throw std::runtime_error("meta_offset must be a multiple of meta_block_size = "+std::to_string(meta_block_size));
  135. }
  136. if (journal.offset % journal_block_size)
  137. {
  138. throw std::runtime_error("journal_offset must be a multiple of journal_block_size = "+std::to_string(journal_block_size));
  139. }
  140. if (journal.sector_count < 2)
  141. {
  142. journal.sector_count = 32;
  143. }
  144. if (metadata_buf_size < 65536)
  145. {
  146. metadata_buf_size = 4*1024*1024;
  147. }
  148. if (meta_device == "")
  149. {
  150. disable_meta_fsync = disable_data_fsync;
  151. }
  152. if (journal_device == "")
  153. {
  154. disable_journal_fsync = disable_meta_fsync;
  155. }
  156. if (immediate_commit != IMMEDIATE_NONE && !disable_journal_fsync)
  157. {
  158. throw std::runtime_error("immediate_commit requires disable_journal_fsync");
  159. }
  160. if (immediate_commit == IMMEDIATE_ALL && !disable_data_fsync)
  161. {
  162. throw std::runtime_error("immediate_commit=all requires disable_journal_fsync and disable_data_fsync");
  163. }
  164. // init some fields
  165. clean_entry_bitmap_size = block_size / bitmap_granularity / 8;
  166. clean_entry_size = sizeof(clean_disk_entry) + clean_entry_bitmap_size + entry_attr_size;
  167. journal.block_size = journal_block_size;
  168. journal.next_free = journal_block_size;
  169. journal.used_start = journal_block_size;
  170. // no free space because sector is initially unmapped
  171. journal.in_sector_pos = journal_block_size;
  172. }
  173. void blockstore_impl_t::calc_lengths()
  174. {
  175. // data
  176. data_len = data_size - data_offset;
  177. if (data_fd == meta_fd && data_offset < meta_offset)
  178. {
  179. data_len = meta_offset - data_offset;
  180. }
  181. if (data_fd == journal.fd && data_offset < journal.offset)
  182. {
  183. data_len = data_len < journal.offset-data_offset
  184. ? data_len : journal.offset-data_offset;
  185. }
  186. if (cfg_data_size != 0)
  187. {
  188. if (data_len < cfg_data_size)
  189. {
  190. throw std::runtime_error("Data area ("+std::to_string(data_len)+
  191. " bytes) is less than configured size ("+std::to_string(cfg_data_size)+" bytes)");
  192. }
  193. data_len = cfg_data_size;
  194. }
  195. // meta
  196. meta_area = (meta_fd == data_fd ? data_size : meta_size) - meta_offset;
  197. if (meta_fd == data_fd && meta_offset <= data_offset)
  198. {
  199. meta_area = data_offset - meta_offset;
  200. }
  201. if (meta_fd == journal.fd && meta_offset <= journal.offset)
  202. {
  203. meta_area = meta_area < journal.offset-meta_offset
  204. ? meta_area : journal.offset-meta_offset;
  205. }
  206. // journal
  207. journal.len = (journal.fd == data_fd ? data_size : (journal.fd == meta_fd ? meta_size : journal.device_size)) - journal.offset;
  208. if (journal.fd == data_fd && journal.offset <= data_offset)
  209. {
  210. journal.len = data_offset - journal.offset;
  211. }
  212. if (journal.fd == meta_fd && journal.offset <= meta_offset)
  213. {
  214. journal.len = journal.len < meta_offset-journal.offset
  215. ? journal.len : meta_offset-journal.offset;
  216. }
  217. // required metadata size
  218. block_count = data_len / block_size;
  219. meta_len = ((block_count - 1 + meta_block_size / clean_entry_size) / (meta_block_size / clean_entry_size)) * meta_block_size;
  220. if (meta_area < meta_len)
  221. {
  222. throw std::runtime_error("Metadata area is too small, need at least "+std::to_string(meta_len)+" bytes");
  223. }
  224. if (inmemory_meta)
  225. {
  226. metadata_buffer = memalign(MEM_ALIGNMENT, meta_len);
  227. if (!metadata_buffer)
  228. throw std::runtime_error("Failed to allocate memory for the metadata");
  229. }
  230. else if (clean_entry_bitmap_size || entry_attr_size)
  231. {
  232. clean_bitmap = (uint8_t*)malloc(block_count * (clean_entry_bitmap_size + entry_attr_size));
  233. if (!clean_bitmap)
  234. throw std::runtime_error("Failed to allocate memory for the metadata sparse write bitmap");
  235. }
  236. // requested journal size
  237. if (cfg_journal_size > journal.len)
  238. {
  239. throw std::runtime_error("Requested journal_size is too large");
  240. }
  241. else if (cfg_journal_size > 0)
  242. {
  243. journal.len = cfg_journal_size;
  244. }
  245. if (journal.len < MIN_JOURNAL_SIZE)
  246. {
  247. throw std::runtime_error("Journal is too small, need at least "+std::to_string(MIN_JOURNAL_SIZE)+" bytes");
  248. }
  249. if (journal.inmemory)
  250. {
  251. journal.buffer = memalign(MEM_ALIGNMENT, journal.len);
  252. if (!journal.buffer)
  253. throw std::runtime_error("Failed to allocate memory for journal");
  254. }
  255. }
  256. void check_size(int fd, uint64_t *size, std::string name)
  257. {
  258. int sectsize;
  259. struct stat st;
  260. if (fstat(fd, &st) < 0)
  261. {
  262. throw std::runtime_error("Failed to stat "+name);
  263. }
  264. if (S_ISREG(st.st_mode))
  265. {
  266. *size = st.st_size;
  267. }
  268. else if (S_ISBLK(st.st_mode))
  269. {
  270. if (ioctl(fd, BLKSSZGET, &sectsize) < 0 ||
  271. ioctl(fd, BLKGETSIZE64, size) < 0 ||
  272. sectsize != 512)
  273. {
  274. throw std::runtime_error(name+" sector is not equal to 512 bytes");
  275. }
  276. }
  277. else
  278. {
  279. throw std::runtime_error(name+" is neither a file nor a block device");
  280. }
  281. }
  282. void blockstore_impl_t::open_data()
  283. {
  284. data_fd = open(data_device.c_str(), O_DIRECT|O_RDWR);
  285. if (data_fd == -1)
  286. {
  287. throw std::runtime_error("Failed to open data device");
  288. }
  289. check_size(data_fd, &data_size, "data device");
  290. if (data_offset >= data_size)
  291. {
  292. throw std::runtime_error("data_offset exceeds device size = "+std::to_string(data_size));
  293. }
  294. if (!disable_flock && flock(data_fd, LOCK_EX|LOCK_NB) != 0)
  295. {
  296. throw std::runtime_error(std::string("Failed to lock data device: ") + strerror(errno));
  297. }
  298. }
  299. void blockstore_impl_t::open_meta()
  300. {
  301. if (meta_device != "")
  302. {
  303. meta_offset = 0;
  304. meta_fd = open(meta_device.c_str(), O_DIRECT|O_RDWR);
  305. if (meta_fd == -1)
  306. {
  307. throw std::runtime_error("Failed to open metadata device");
  308. }
  309. check_size(meta_fd, &meta_size, "metadata device");
  310. if (meta_offset >= meta_size)
  311. {
  312. throw std::runtime_error("meta_offset exceeds device size = "+std::to_string(meta_size));
  313. }
  314. if (!disable_flock && flock(meta_fd, LOCK_EX|LOCK_NB) != 0)
  315. {
  316. throw std::runtime_error(std::string("Failed to lock metadata device: ") + strerror(errno));
  317. }
  318. }
  319. else
  320. {
  321. meta_fd = data_fd;
  322. meta_size = 0;
  323. if (meta_offset >= data_size)
  324. {
  325. throw std::runtime_error("meta_offset exceeds device size = "+std::to_string(data_size));
  326. }
  327. }
  328. }
  329. void blockstore_impl_t::open_journal()
  330. {
  331. if (journal_device != "")
  332. {
  333. journal.fd = open(journal_device.c_str(), O_DIRECT|O_RDWR);
  334. if (journal.fd == -1)
  335. {
  336. throw std::runtime_error("Failed to open journal device");
  337. }
  338. check_size(journal.fd, &journal.device_size, "journal device");
  339. if (!disable_flock && flock(journal.fd, LOCK_EX|LOCK_NB) != 0)
  340. {
  341. throw std::runtime_error(std::string("Failed to lock journal device: ") + strerror(errno));
  342. }
  343. }
  344. else
  345. {
  346. journal.fd = meta_fd;
  347. journal.device_size = 0;
  348. if (journal.offset >= data_size)
  349. {
  350. throw std::runtime_error("journal_offset exceeds device size");
  351. }
  352. }
  353. journal.sector_info = (journal_sector_info_t*)calloc(journal.sector_count, sizeof(journal_sector_info_t));
  354. if (!journal.sector_info)
  355. {
  356. throw std::bad_alloc();
  357. }
  358. if (!journal.inmemory)
  359. {
  360. journal.sector_buf = (uint8_t*)memalign(MEM_ALIGNMENT, journal.sector_count * journal_block_size);
  361. if (!journal.sector_buf)
  362. throw std::bad_alloc();
  363. }
  364. }