Simplified distributed block storage with strong consistency, like in Ceph

#include "blockstore_impl.h"

#define SYNC_HAS_SMALL 1
#define SYNC_HAS_BIG 2
#define SYNC_DATA_SYNC_SENT 3
#define SYNC_DATA_SYNC_DONE 4
#define SYNC_JOURNAL_WRITE_SENT 5
#define SYNC_JOURNAL_WRITE_DONE 6
#define SYNC_JOURNAL_SYNC_SENT 7
#define SYNC_DONE 8
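
// Sync state machine. A sync with big writes (written to the data device)
// passes through HAS_BIG -> DATA_SYNC_SENT -> DATA_SYNC_DONE ->
// JOURNAL_WRITE_SENT -> JOURNAL_WRITE_DONE -> JOURNAL_SYNC_SENT -> DONE.
// A sync with only small (journaled) writes starts at HAS_SMALL and skips
// the data fsync steps. disable_data_fsync / disable_journal_fsync skip
// the corresponding *_SYNC_SENT states.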
int blockstore_impl_t::dequeue_sync(blockstore_op_t *op)
{
    if (PRIV(op)->sync_state == 0)
    {
        stop_sync_submitted = false;
        PRIV(op)->sync_big_writes.swap(unsynced_big_writes);
        PRIV(op)->sync_small_writes.swap(unsynced_small_writes);
        PRIV(op)->sync_small_checked = 0;
        PRIV(op)->sync_big_checked = 0;
        unsynced_big_writes.clear();
        unsynced_small_writes.clear();
        if (PRIV(op)->sync_big_writes.size() > 0)
            PRIV(op)->sync_state = SYNC_HAS_BIG;
        else if (PRIV(op)->sync_small_writes.size() > 0)
            PRIV(op)->sync_state = SYNC_HAS_SMALL;
        else
            PRIV(op)->sync_state = SYNC_DONE;
        // Always add sync to in_progress_syncs because we clear unsynced_big_writes and unsynced_small_writes
        PRIV(op)->prev_sync_count = in_progress_syncs.size();
        PRIV(op)->in_progress_ptr = in_progress_syncs.insert(in_progress_syncs.end(), op);
    }
    continue_sync(op);
    // Always dequeue because we always add syncs to in_progress_syncs
    return 1;
}
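
// Advance the sync state machine. Returns 0 if the operation has to wait
// (for in-flight writes to complete or for journal space) and must be
// retried later, 1 if it made progress or finished.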
int blockstore_impl_t::continue_sync(blockstore_op_t *op)
{
    auto cb = [this, op](ring_data_t *data) { handle_sync_event(data, op); };
    if (PRIV(op)->sync_state == SYNC_HAS_SMALL)
    {
        // No big writes, just fsync the journal
        for (; PRIV(op)->sync_small_checked < PRIV(op)->sync_small_writes.size(); PRIV(op)->sync_small_checked++)
        {
            if (IS_IN_FLIGHT(dirty_db[PRIV(op)->sync_small_writes[PRIV(op)->sync_small_checked]].state))
            {
                // Wait for small inflight writes to complete
                return 0;
            }
        }
        if (journal.sector_info[journal.cur_sector].dirty)
        {
            // Write out the last journal sector if it happens to be dirty
            BS_SUBMIT_GET_ONLY_SQE(sqe);
            prepare_journal_sector_write(journal, journal.cur_sector, sqe, cb);
            PRIV(op)->min_used_journal_sector = PRIV(op)->max_used_journal_sector = 1 + journal.cur_sector;
            PRIV(op)->pending_ops = 1;
            PRIV(op)->sync_state = SYNC_JOURNAL_WRITE_SENT;
            return 1;
        }
        else
        {
            PRIV(op)->sync_state = SYNC_JOURNAL_WRITE_DONE;
        }
    }
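    // Big writes go to the data device, so it has to be fsynced before the
    // journal entries referencing the new data locations are written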
    if (PRIV(op)->sync_state == SYNC_HAS_BIG)
    {
        for (; PRIV(op)->sync_big_checked < PRIV(op)->sync_big_writes.size(); PRIV(op)->sync_big_checked++)
        {
            if (IS_IN_FLIGHT(dirty_db[PRIV(op)->sync_big_writes[PRIV(op)->sync_big_checked]].state))
            {
                // Wait for big inflight writes to complete
                return 0;
            }
        }
        // 1st step: fsync data
        if (!disable_data_fsync)
        {
            BS_SUBMIT_GET_SQE(sqe, data);
            my_uring_prep_fsync(sqe, data_fd_index, IORING_FSYNC_DATASYNC);
            sqe->flags |= IOSQE_FIXED_FILE;
            data->iov = { 0 };
            data->callback = cb;
            PRIV(op)->min_used_journal_sector = PRIV(op)->max_used_journal_sector = 0;
            PRIV(op)->pending_ops = 1;
            PRIV(op)->sync_state = SYNC_DATA_SYNC_SENT;
            return 1;
        }
        else
        {
            PRIV(op)->sync_state = SYNC_DATA_SYNC_DONE;
        }
    }
    if (PRIV(op)->sync_state == SYNC_DATA_SYNC_DONE)
    {
        for (; PRIV(op)->sync_small_checked < PRIV(op)->sync_small_writes.size(); PRIV(op)->sync_small_checked++)
        {
            if (IS_IN_FLIGHT(dirty_db[PRIV(op)->sync_small_writes[PRIV(op)->sync_small_checked]].state))
            {
                // Wait for small inflight writes to complete
                return 0;
            }
        }
        // 2nd step: Data device is synced, prepare & write journal entries
        // Check space in the journal and journal memory buffers
        blockstore_journal_check_t space_check(this);
        if (!space_check.check_available(op, PRIV(op)->sync_big_writes.size(), sizeof(journal_entry_big_write), 0))
        {
            return 0;
        }
        // Get SQEs. Don't bother about merging, submit each journal sector as a separate request
        struct io_uring_sqe *sqe[space_check.sectors_required];
        for (int i = 0; i < space_check.sectors_required; i++)
        {
            BS_SUBMIT_GET_SQE_DECL(sqe[i]);
        }
        // Prepare and submit journal entries
        auto it = PRIV(op)->sync_big_writes.begin();
        int s = 0, cur_sector = -1;
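        // If the next journal entry won't fit into the current journal sector
        // and that sector is still dirty, flush it first so new entries start
        // in a clean sector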
        if ((journal_block_size - journal.in_sector_pos) < sizeof(journal_entry_big_write) &&
            journal.sector_info[journal.cur_sector].dirty)
        {
            if (cur_sector == -1)
                PRIV(op)->min_used_journal_sector = 1 + journal.cur_sector;
            cur_sector = journal.cur_sector;
            prepare_journal_sector_write(journal, cur_sector, sqe[s++], cb);
        }
        while (it != PRIV(op)->sync_big_writes.end())
        {
            journal_entry_big_write *je = (journal_entry_big_write*)
                prefill_single_journal_entry(journal, JE_BIG_WRITE, sizeof(journal_entry_big_write));
            dirty_db[*it].journal_sector = journal.sector_info[journal.cur_sector].offset;
            journal.sector_info[journal.cur_sector].dirty = false;
            journal.used_sectors[journal.sector_info[journal.cur_sector].offset]++;
#ifdef BLOCKSTORE_DEBUG
            printf("journal offset %lu is used by %lu:%lu v%lu\n", dirty_db[*it].journal_sector, it->oid.inode, it->oid.stripe, it->version);
#endif
            je->oid = it->oid;
            je->version = it->version;
            je->offset = dirty_db[*it].offset;
            je->len = dirty_db[*it].len;
            je->location = dirty_db[*it].location;
            je->crc32 = je_crc32((journal_entry*)je);
            journal.crc32_last = je->crc32;
            it++;
            if (cur_sector != journal.cur_sector)
            {
                if (cur_sector == -1)
                    PRIV(op)->min_used_journal_sector = 1 + journal.cur_sector;
                cur_sector = journal.cur_sector;
                prepare_journal_sector_write(journal, cur_sector, sqe[s++], cb);
            }
        }
        PRIV(op)->max_used_journal_sector = 1 + journal.cur_sector;
        PRIV(op)->pending_ops = s;
        PRIV(op)->sync_state = SYNC_JOURNAL_WRITE_SENT;
        return 1;
    }
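    // 3rd step: Journal entries are written, fsync the journal device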
    if (PRIV(op)->sync_state == SYNC_JOURNAL_WRITE_DONE)
    {
        if (!disable_journal_fsync)
        {
            BS_SUBMIT_GET_SQE(sqe, data);
            my_uring_prep_fsync(sqe, journal_fd_index, IORING_FSYNC_DATASYNC);
            sqe->flags |= IOSQE_FIXED_FILE;
            data->iov = { 0 };
            data->callback = cb;
            PRIV(op)->pending_ops = 1;
            PRIV(op)->sync_state = SYNC_JOURNAL_SYNC_SENT;
            return 1;
        }
        else
        {
            PRIV(op)->sync_state = SYNC_DONE;
        }
    }
    if (PRIV(op)->sync_state == SYNC_DONE)
    {
        ack_sync(op);
    }
    return 1;
}
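
// io_uring completion callback for all sync-related submissions. When the
// last pending request of the current step completes, the state machine
// advances to the next state.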
void blockstore_impl_t::handle_sync_event(ring_data_t *data, blockstore_op_t *op)
{
    live = true;
    if (data->res != data->iov.iov_len)
    {
        throw std::runtime_error(
            "write operation failed ("+std::to_string(data->res)+" != "+std::to_string(data->iov.iov_len)+
            "): in-memory state is corrupted, cannot continue"
        );
    }
    PRIV(op)->pending_ops--;
    if (PRIV(op)->pending_ops == 0)
    {
        // Release used journal sectors
        release_journal_sectors(op);
        // Handle states
        if (PRIV(op)->sync_state == SYNC_DATA_SYNC_SENT)
        {
            PRIV(op)->sync_state = SYNC_DATA_SYNC_DONE;
        }
        else if (PRIV(op)->sync_state == SYNC_JOURNAL_WRITE_SENT)
        {
            PRIV(op)->sync_state = SYNC_JOURNAL_WRITE_DONE;
        }
        else if (PRIV(op)->sync_state == SYNC_JOURNAL_SYNC_SENT)
        {
            PRIV(op)->sync_state = SYNC_DONE;
            ack_sync(op);
        }
        else
        {
            throw std::runtime_error("BUG: unexpected sync op state");
        }
    }
}
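
// Try to acknowledge the sync. Syncs are acknowledged in submission order:
// a finished sync is only completed when all syncs submitted before it
// (prev_sync_count) are already acknowledged, and completing it may in turn
// unblock subsequent finished syncs in in_progress_syncs.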
int blockstore_impl_t::ack_sync(blockstore_op_t *op)
{
    if (PRIV(op)->sync_state == SYNC_DONE && PRIV(op)->prev_sync_count == 0)
    {
        // Remove dependency of subsequent syncs
        auto it = PRIV(op)->in_progress_ptr;
        int done_syncs = 1;
        ++it;
        // Acknowledge sync
        ack_one_sync(op);
        while (it != in_progress_syncs.end())
        {
            auto & next_sync = *it++;
            PRIV(next_sync)->prev_sync_count -= done_syncs;
            if (PRIV(next_sync)->prev_sync_count == 0 && PRIV(next_sync)->sync_state == SYNC_DONE)
            {
                done_syncs++;
                // Acknowledge next_sync
                ack_one_sync(next_sync);
            }
        }
        return 1;
    }
    return 0;
}
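
// Complete a single sync: mark all of its writes as synced in dirty_db,
// remember the latest synced version of each object in unstable_writes,
// and finish the operation.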
void blockstore_impl_t::ack_one_sync(blockstore_op_t *op)
{
    // Handle states
    for (auto it = PRIV(op)->sync_big_writes.begin(); it != PRIV(op)->sync_big_writes.end(); it++)
    {
#ifdef BLOCKSTORE_DEBUG
        printf("Ack sync big %lu:%lu v%lu\n", it->oid.inode, it->oid.stripe, it->version);
#endif
        auto & unstab = unstable_writes[it->oid];
        unstab = unstab < it->version ? it->version : unstab;
        dirty_db[*it].state = ST_D_META_SYNCED;
    }
    for (auto it = PRIV(op)->sync_small_writes.begin(); it != PRIV(op)->sync_small_writes.end(); it++)
    {
#ifdef BLOCKSTORE_DEBUG
        printf("Ack sync small %lu:%lu v%lu\n", it->oid.inode, it->oid.stripe, it->version);
#endif
        auto & unstab = unstable_writes[it->oid];
        unstab = unstab < it->version ? it->version : unstab;
        dirty_db[*it].state = dirty_db[*it].state == ST_DEL_WRITTEN ? ST_DEL_SYNCED : ST_J_SYNCED;
    }
    in_progress_syncs.erase(PRIV(op)->in_progress_ptr);
    op->retval = 0;
    FINISH_OP(op);
}