Simplified distributed block storage with strong consistency, like in Ceph

#include "blockstore_impl.h"
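
// Entry point for writes and deletes: assigns or validates the version number and
// registers the new version in dirty_db. Returns false if the operation is finished
// immediately (invalid version or repeated delete), true if it should proceed.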
bool blockstore_impl_t::enqueue_write(blockstore_op_t *op)
{
    // Check or assign version number
    bool found = false, deleted = false, is_del = (op->opcode == BS_OP_DELETE);
    bool is_inflight_big = false;
    uint64_t version = 1;
    if (dirty_db.size() > 0)
    {
        auto dirty_it = dirty_db.upper_bound((obj_ver_id){
            .oid = op->oid,
            .version = UINT64_MAX,
        });
        dirty_it--; // segfaults when dirty_db is empty
        if (dirty_it != dirty_db.end() && dirty_it->first.oid == op->oid)
        {
            found = true;
            version = dirty_it->first.version + 1;
            deleted = IS_DELETE(dirty_it->second.state);
            is_inflight_big = dirty_it->second.state >= ST_D_IN_FLIGHT &&
                dirty_it->second.state < ST_D_SYNCED ||
                dirty_it->second.state == ST_J_WAIT_BIG;
        }
    }
    if (!found)
    {
        auto clean_it = clean_db.find(op->oid);
        if (clean_it != clean_db.end())
        {
            version = clean_it->second.version + 1;
        }
        else
        {
            deleted = true;
        }
    }
    if (op->version == 0)
    {
        op->version = version;
    }
    else if (op->version < version)
    {
        // Invalid version requested
        op->retval = -EEXIST;
        return false;
    }
    if (deleted && is_del)
    {
        // Already deleted
        op->retval = 0;
        return false;
    }
    if (is_inflight_big && !is_del && !deleted && op->len < block_size &&
        immediate_commit != IMMEDIATE_ALL)
    {
        // Issue an additional sync so that the previous big write can reach the journal
        blockstore_op_t *sync_op = new blockstore_op_t;
        sync_op->opcode = BS_OP_SYNC;
        sync_op->callback = [this, op](blockstore_op_t *sync_op)
        {
            delete sync_op;
        };
        enqueue_op(sync_op);
    }
#ifdef BLOCKSTORE_DEBUG
    if (is_del)
        printf("Delete %lu:%lu v%lu\n", op->oid.inode, op->oid.stripe, op->version);
    else
        printf("Write %lu:%lu v%lu offset=%u len=%u\n", op->oid.inode, op->oid.stripe, op->version, op->offset, op->len);
#endif
    // No strict need to add it into dirty_db here, it's just left
    // from the previous implementation where reads waited for writes
    dirty_db.emplace((obj_ver_id){
        .oid = op->oid,
        .version = op->version,
    }, (dirty_entry){
        .state = (uint32_t)(
            is_del
            ? ST_DEL_IN_FLIGHT
            : (op->len == block_size || deleted ? ST_D_IN_FLIGHT : (is_inflight_big ? ST_J_WAIT_BIG : ST_J_IN_FLIGHT))
        ),
        .flags = 0,
        .location = 0,
        .offset = is_del ? 0 : op->offset,
        .len = is_del ? 0 : op->len,
        .journal_sector = 0,
    });
    return true;
}
// First step of the write algorithm: dequeue operation and submit initial write(s)
int blockstore_impl_t::dequeue_write(blockstore_op_t *op)
{
    if (PRIV(op)->op_state)
    {
        return continue_write(op);
    }
    auto dirty_it = dirty_db.find((obj_ver_id){
        .oid = op->oid,
        .version = op->version,
    });
    if (dirty_it->second.state == ST_J_WAIT_BIG)
    {
        return 0;
    }
    else if (dirty_it->second.state == ST_D_IN_FLIGHT)
    {
        blockstore_journal_check_t space_check(this);
        if (!space_check.check_available(op, unsynced_big_writes.size() + 1, sizeof(journal_entry_big_write), JOURNAL_STABILIZE_RESERVATION))
        {
            return 0;
        }
        // Big (redirect) write
        uint64_t loc = data_alloc->find_free();
        if (loc == UINT64_MAX)
        {
            // no space
            if (flusher->is_active())
            {
                // hope that some space will be available after flush
                PRIV(op)->wait_for = WAIT_FREE;
                return 0;
            }
            op->retval = -ENOSPC;
            FINISH_OP(op);
            return 1;
        }
        BS_SUBMIT_GET_SQE(sqe, data);
        dirty_it->second.location = loc << block_order;
        dirty_it->second.state = ST_D_SUBMITTED;
#ifdef BLOCKSTORE_DEBUG
        printf("Allocate block %lu\n", loc);
#endif
        data_alloc->set(loc, true);
        uint64_t stripe_offset = (op->offset % bitmap_granularity);
        uint64_t stripe_end = (op->offset + op->len) % bitmap_granularity;
        // Zero fill up to bitmap_granularity
        int vcnt = 0;
        if (stripe_offset)
        {
            PRIV(op)->iov_zerofill[vcnt++] = (struct iovec){ zero_object, stripe_offset };
        }
        PRIV(op)->iov_zerofill[vcnt++] = (struct iovec){ op->buf, op->len };
        if (stripe_end)
        {
            stripe_end = bitmap_granularity - stripe_end;
            PRIV(op)->iov_zerofill[vcnt++] = (struct iovec){ zero_object, stripe_end };
        }
        data->iov.iov_len = op->len + stripe_offset + stripe_end; // to check it in the callback
        data->callback = [this, op](ring_data_t *data) { handle_write_event(data, op); };
        my_uring_prep_writev(
            sqe, data_fd, PRIV(op)->iov_zerofill, vcnt, data_offset + (loc << block_order) + op->offset - stripe_offset
        );
        PRIV(op)->pending_ops = 1;
        PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 0;
        if (immediate_commit != IMMEDIATE_ALL)
        {
            // Remember big write as unsynced
            unsynced_big_writes.push_back((obj_ver_id){
                .oid = op->oid,
                .version = op->version,
            });
            PRIV(op)->op_state = 3;
        }
        else
        {
            PRIV(op)->op_state = 1;
        }
    }
    else
    {
        // Small (journaled) write
        // First check if the journal has sufficient space
        blockstore_journal_check_t space_check(this);
        if (unsynced_big_writes.size() && !space_check.check_available(op, unsynced_big_writes.size(), sizeof(journal_entry_big_write), 0)
            || !space_check.check_available(op, 1, sizeof(journal_entry_small_write), op->len + JOURNAL_STABILIZE_RESERVATION))
        {
            return 0;
        }
        // There is sufficient space. Get SQE(s)
        struct io_uring_sqe *sqe1 = NULL;
        if (immediate_commit != IMMEDIATE_NONE ||
            (journal_block_size - journal.in_sector_pos) < sizeof(journal_entry_small_write) &&
            journal.sector_info[journal.cur_sector].dirty)
        {
            // Write current journal sector only if it's dirty and full, or in the immediate_commit mode
            BS_SUBMIT_GET_SQE_DECL(sqe1);
        }
        struct io_uring_sqe *sqe2 = NULL;
        if (op->len > 0)
        {
            BS_SUBMIT_GET_SQE_DECL(sqe2);
        }
        // Got SQEs. Prepare previous journal sector write if required
        auto cb = [this, op](ring_data_t *data) { handle_write_event(data, op); };
        if (immediate_commit == IMMEDIATE_NONE)
        {
            if (sqe1)
            {
                prepare_journal_sector_write(journal, journal.cur_sector, sqe1, cb);
                PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 1 + journal.cur_sector;
                PRIV(op)->pending_ops++;
            }
            else
            {
                PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 0;
            }
        }
        // Then pre-fill journal entry
        journal_entry_small_write *je = (journal_entry_small_write*)
            prefill_single_journal_entry(journal, JE_SMALL_WRITE, sizeof(journal_entry_small_write));
        dirty_it->second.journal_sector = journal.sector_info[journal.cur_sector].offset;
        journal.used_sectors[journal.sector_info[journal.cur_sector].offset]++;
#ifdef BLOCKSTORE_DEBUG
        printf("journal offset %lu is used by %lu:%lu v%lu\n", dirty_it->second.journal_sector, dirty_it->first.oid.inode, dirty_it->first.oid.stripe, dirty_it->first.version);
#endif
        // Figure out where data will be
        journal.next_free = (journal.next_free + op->len) <= journal.len ? journal.next_free : journal_block_size;
        je->oid = op->oid;
        je->version = op->version;
        je->offset = op->offset;
        je->len = op->len;
        je->data_offset = journal.next_free;
        je->crc32_data = crc32c(0, op->buf, op->len);
        je->crc32 = je_crc32((journal_entry*)je);
        journal.crc32_last = je->crc32;
        if (immediate_commit != IMMEDIATE_NONE)
        {
            prepare_journal_sector_write(journal, journal.cur_sector, sqe1, cb);
            PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 1 + journal.cur_sector;
            PRIV(op)->pending_ops++;
        }
        if (op->len > 0)
        {
            // Prepare journal data write
            if (journal.inmemory)
            {
                // Copy data
                memcpy(journal.buffer + journal.next_free, op->buf, op->len);
            }
            ring_data_t *data2 = ((ring_data_t*)sqe2->user_data);
            data2->iov = (struct iovec){ op->buf, op->len };
            data2->callback = cb;
            my_uring_prep_writev(
                sqe2, journal.fd, &data2->iov, 1, journal.offset + journal.next_free
            );
            PRIV(op)->pending_ops++;
        }
        else
        {
            // Zero-length overwrite. Allowed to bump object version in EC placement groups without actually writing data
        }
        dirty_it->second.location = journal.next_free;
        dirty_it->second.state = ST_J_SUBMITTED;
        journal.next_free += op->len;
        if (journal.next_free >= journal.len)
        {
            journal.next_free = journal_block_size;
        }
        if (immediate_commit == IMMEDIATE_NONE)
        {
            // Remember small write as unsynced
            unsynced_small_writes.push_back((obj_ver_id){
                .oid = op->oid,
                .version = op->version,
            });
        }
        if (!PRIV(op)->pending_ops)
        {
            PRIV(op)->op_state = 4;
            continue_write(op);
        }
        else
        {
            PRIV(op)->op_state = 3;
        }
    }
    inflight_writes++;
    return 1;
}
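// Next steps of the write algorithm: continue_write() resumes an operation from
// PRIV(op)->op_state (handle_write_event() increments it when all pending I/O completes).
// State 2 means the big_write journal entry still has to be submitted (immediate_commit
// mode), state 4 means all writes have finished and the operation can be acknowledged.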
int blockstore_impl_t::continue_write(blockstore_op_t *op)
{
    io_uring_sqe *sqe = NULL;
    journal_entry_big_write *je;
    auto dirty_it = dirty_db.find((obj_ver_id){
        .oid = op->oid,
        .version = op->version,
    });
    if (PRIV(op)->op_state == 2)
        goto resume_2;
    else if (PRIV(op)->op_state == 4)
        goto resume_4;
    else
        return 1;
resume_2:
    // Only for the immediate_commit mode: prepare and submit big_write journal entry
#ifdef BLOCKSTORE_DEBUG
    {
        timespec now;
        clock_gettime(CLOCK_REALTIME, &now);
        printf("get_sqe %s %d %ld.%06ld\n", __FILE__, __LINE__, now.tv_sec, now.tv_nsec/1000);
    }
#endif
    sqe = get_sqe();
    if (!sqe)
    {
        return 0;
    }
    je = (journal_entry_big_write*)prefill_single_journal_entry(journal, JE_BIG_WRITE, sizeof(journal_entry_big_write));
    dirty_it->second.journal_sector = journal.sector_info[journal.cur_sector].offset;
    journal.sector_info[journal.cur_sector].dirty = false;
    journal.used_sectors[journal.sector_info[journal.cur_sector].offset]++;
#ifdef BLOCKSTORE_DEBUG
    printf("journal offset %lu is used by %lu:%lu v%lu\n", journal.sector_info[journal.cur_sector].offset, op->oid.inode, op->oid.stripe, op->version);
#endif
    je->oid = op->oid;
    je->version = op->version;
    je->offset = op->offset;
    je->len = op->len;
    je->location = dirty_it->second.location;
    je->crc32 = je_crc32((journal_entry*)je);
    journal.crc32_last = je->crc32;
    prepare_journal_sector_write(journal, journal.cur_sector, sqe,
        [this, op](ring_data_t *data) { handle_write_event(data, op); });
    PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 1 + journal.cur_sector;
    PRIV(op)->pending_ops = 1;
    PRIV(op)->op_state = 3;
    return 1;
resume_4:
    // Switch object state
#ifdef BLOCKSTORE_DEBUG
    {
        timespec now;
        clock_gettime(CLOCK_REALTIME, &now);
        printf("write_done %s %d %ld.%06ld\n", __FILE__, __LINE__, now.tv_sec, now.tv_nsec/1000);
    }
#endif
#ifdef BLOCKSTORE_DEBUG
    printf("Ack write %lu:%lu v%lu = %d\n", op->oid.inode, op->oid.stripe, op->version, dirty_it->second.state);
#endif
    bool imm = dirty_it->second.state == ST_D_SUBMITTED
        ? (immediate_commit == IMMEDIATE_ALL)
        : (immediate_commit != IMMEDIATE_NONE);
    if (imm)
    {
        auto & unstab = unstable_writes[op->oid];
        unstab = unstab < op->version ? op->version : unstab;
    }
    if (dirty_it->second.state == ST_J_SUBMITTED)
    {
        dirty_it->second.state = imm ? ST_J_SYNCED : ST_J_WRITTEN;
    }
    else if (dirty_it->second.state == ST_D_SUBMITTED)
    {
        dirty_it->second.state = imm ? ST_D_SYNCED : ST_D_WRITTEN;
    }
    else if (dirty_it->second.state == ST_DEL_SUBMITTED)
    {
        dirty_it->second.state = imm ? ST_DEL_SYNCED : ST_DEL_WRITTEN;
    }
    if (immediate_commit == IMMEDIATE_ALL)
    {
        dirty_it++;
        while (dirty_it != dirty_db.end() && dirty_it->first.oid == op->oid)
        {
            if (dirty_it->second.state == ST_J_WAIT_BIG)
            {
                dirty_it->second.state = ST_J_IN_FLIGHT;
            }
            dirty_it++;
        }
    }
    inflight_writes--;
    // Acknowledge write
    op->retval = op->len;
    FINISH_OP(op);
    return 1;
}
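// io_uring completion callback shared by all write submissions: verify the result,
// decrement pending_ops and, once the last one completes, advance op_state and re-run
// continue_write() (the op is put back on the submit queue if it can't continue yet).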
void blockstore_impl_t::handle_write_event(ring_data_t *data, blockstore_op_t *op)
{
    live = true;
    if (data->res != data->iov.iov_len)
    {
        inflight_writes--;
        // FIXME: our state becomes corrupted after a write error. maybe do something better than just die
        throw std::runtime_error(
            "write operation failed ("+std::to_string(data->res)+" != "+std::to_string(data->iov.iov_len)+
            "). in-memory state is corrupted. AAAAAAAaaaaaaaaa!!!111"
        );
    }
    PRIV(op)->pending_ops--;
    if (PRIV(op)->pending_ops == 0)
    {
        release_journal_sectors(op);
        PRIV(op)->op_state++;
        if (!continue_write(op))
        {
            submit_queue.push_front(op);
        }
    }
}
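// Drop this operation's references to the journal sectors it flushed; once a sector
// is fully released (and is no longer the current one), advance journal.dirty_start
// past it so the space can be reused.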
void blockstore_impl_t::release_journal_sectors(blockstore_op_t *op)
{
    // Release flushed journal sectors
    if (PRIV(op)->min_flushed_journal_sector > 0 &&
        PRIV(op)->max_flushed_journal_sector > 0)
    {
        uint64_t s = PRIV(op)->min_flushed_journal_sector;
        while (1)
        {
            journal.sector_info[s-1].usage_count--;
            if (s != (1+journal.cur_sector) && journal.sector_info[s-1].usage_count == 0)
            {
                // We know for sure that we won't write into this sector anymore
                uint64_t new_ds = journal.sector_info[s-1].offset + journal.block_size;
                if ((journal.dirty_start + (journal.dirty_start >= journal.used_start ? 0 : journal.len)) <
                    (new_ds + (new_ds >= journal.used_start ? 0 : journal.len)))
                {
                    journal.dirty_start = new_ds;
                }
            }
            if (s == PRIV(op)->max_flushed_journal_sector)
                break;
            s = 1 + s % journal.sector_count;
        }
        PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 0;
    }
}
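// Deletes only touch the journal: a JE_DELETE entry is written through the same
// sector-write machinery as small writes, no data blocks are modified here.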
int blockstore_impl_t::dequeue_del(blockstore_op_t *op)
{
    auto dirty_it = dirty_db.find((obj_ver_id){
        .oid = op->oid,
        .version = op->version,
    });
    blockstore_journal_check_t space_check(this);
    if (!space_check.check_available(op, 1, sizeof(journal_entry_del), 0))
    {
        return 0;
    }
    io_uring_sqe *sqe = NULL;
    if (immediate_commit != IMMEDIATE_NONE ||
        (journal_block_size - journal.in_sector_pos) < sizeof(journal_entry_del) &&
        journal.sector_info[journal.cur_sector].dirty)
    {
        // Write current journal sector only if it's dirty and full, or in the immediate_commit mode
        BS_SUBMIT_GET_SQE_DECL(sqe);
    }
    auto cb = [this, op](ring_data_t *data) { handle_write_event(data, op); };
    // Prepare journal sector write
    if (immediate_commit == IMMEDIATE_NONE)
    {
        if (sqe)
        {
            prepare_journal_sector_write(journal, journal.cur_sector, sqe, cb);
            PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 1 + journal.cur_sector;
            PRIV(op)->pending_ops++;
        }
        else
        {
            PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 0;
        }
    }
    // Pre-fill journal entry
    journal_entry_del *je = (journal_entry_del*)
        prefill_single_journal_entry(journal, JE_DELETE, sizeof(struct journal_entry_del));
    dirty_it->second.journal_sector = journal.sector_info[journal.cur_sector].offset;
    journal.used_sectors[journal.sector_info[journal.cur_sector].offset]++;
#ifdef BLOCKSTORE_DEBUG
    printf("journal offset %lu is used by %lu:%lu v%lu\n", dirty_it->second.journal_sector, dirty_it->first.oid.inode, dirty_it->first.oid.stripe, dirty_it->first.version);
#endif
    je->oid = op->oid;
    je->version = op->version;
    je->crc32 = je_crc32((journal_entry*)je);
    journal.crc32_last = je->crc32;
    dirty_it->second.state = ST_DEL_SUBMITTED;
    if (immediate_commit != IMMEDIATE_NONE)
    {
        prepare_journal_sector_write(journal, journal.cur_sector, sqe, cb);
        PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 1 + journal.cur_sector;
        PRIV(op)->pending_ops++;
        // Remember the delete as an unsynced "small write"
        unsynced_small_writes.push_back((obj_ver_id){
            .oid = op->oid,
            .version = op->version,
        });
    }
    if (!PRIV(op)->pending_ops)
    {
        PRIV(op)->op_state = 4;
        continue_write(op);
    }
    else
    {
        PRIV(op)->op_state = 3;
    }
    return 1;
}