Simplified distributed block storage with strong consistency, like in Ceph

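// Blockstore core: device setup and teardown, the main submission loop,
// wait-condition checks, operation queueing and object listing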
#include "blockstore_impl.h"

blockstore_impl_t::blockstore_impl_t(blockstore_config_t & config, ring_loop_t *ringloop)
{
    assert(sizeof(blockstore_op_private_t) <= BS_OP_PRIVATE_DATA_SIZE);
    this->ringloop = ringloop;
    ring_consumer.loop = [this]() { loop(); };
    ringloop->register_consumer(ring_consumer);
    initialized = 0;
    zero_object = (uint8_t*)memalign(MEM_ALIGNMENT, block_size);
    data_fd = meta_fd = journal.fd = -1;
    parse_config(config);
    try
    {
        open_data();
        open_meta();
        open_journal();
        calc_lengths();
        data_alloc = new allocator(block_count);
    }
    catch (std::exception & e)
    {
        if (data_fd >= 0)
            close(data_fd);
        if (meta_fd >= 0 && meta_fd != data_fd)
            close(meta_fd);
        if (journal.fd >= 0 && journal.fd != meta_fd)
            close(journal.fd);
        throw;
    }
    flusher = new journal_flusher_t(flusher_count, this);
}

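// Teardown: free in-memory structures, detach from the ring loop and close
// the data, metadata and journal descriptors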
blockstore_impl_t::~blockstore_impl_t()
{
    delete data_alloc;
    delete flusher;
    free(zero_object);
    ringloop->unregister_consumer(ring_consumer);
    if (data_fd >= 0)
        close(data_fd);
    if (meta_fd >= 0 && meta_fd != data_fd)
        close(meta_fd);
    if (journal.fd >= 0 && journal.fd != meta_fd)
        close(journal.fd);
    if (metadata_buffer)
        free(metadata_buffer);
    if (clean_bitmap)
        free(clean_bitmap);
}

bool blockstore_impl_t::is_started()
{
    return initialized == 10;
}

bool blockstore_impl_t::is_stalled()
{
    return queue_stall;
}

// main event loop - produce requests
void blockstore_impl_t::loop()
{
    // FIXME: initialized == 10 is ugly
    if (initialized != 10)
    {
        // read metadata, then journal
        if (initialized == 0)
        {
            metadata_init_reader = new blockstore_init_meta(this);
            initialized = 1;
        }
        if (initialized == 1)
        {
            int res = metadata_init_reader->loop();
            if (!res)
            {
                delete metadata_init_reader;
                metadata_init_reader = NULL;
                journal_init_reader = new blockstore_init_journal(this);
                initialized = 2;
            }
        }
        if (initialized == 2)
        {
            int res = journal_init_reader->loop();
            if (!res)
            {
                delete journal_init_reader;
                journal_init_reader = NULL;
                initialized = 10;
                ringloop->wakeup();
            }
        }
    }
    else
    {
        // try to submit ops
        unsigned initial_ring_space = ringloop->space_left();
        auto cur_sync = in_progress_syncs.begin();
        while (cur_sync != in_progress_syncs.end())
        {
            continue_sync(*cur_sync++);
        }
        auto cur = submit_queue.begin();
        int has_writes = 0;
        while (cur != submit_queue.end())
        {
            auto op_ptr = cur;
            auto op = *(cur++);
            // FIXME: This needs some simplification
            // Writes should not block reads if the ring is not full and reads don't depend on them
            // In all other cases we should stop submission
            if (PRIV(op)->wait_for)
            {
                check_wait(op);
#ifdef BLOCKSTORE_DEBUG
                if (PRIV(op)->wait_for)
                {
                    printf("still waiting for %d\n", PRIV(op)->wait_for);
                }
#endif
                if (PRIV(op)->wait_for == WAIT_SQE)
                {
                    break;
                }
                else if (PRIV(op)->wait_for)
                {
                    if (op->opcode == BS_OP_WRITE || op->opcode == BS_OP_DELETE)
                    {
                        has_writes = 2;
                    }
                    continue;
                }
            }
            unsigned ring_space = ringloop->space_left();
            unsigned prev_sqe_pos = ringloop->save();
            int dequeue_op = 0;
            if (op->opcode == BS_OP_READ)
            {
                dequeue_op = dequeue_read(op);
            }
            else if (op->opcode == BS_OP_WRITE || op->opcode == BS_OP_DELETE)
            {
                if (has_writes == 2)
                {
                    // Some writes could not be submitted
                    break;
                }
                dequeue_op = dequeue_write(op);
                has_writes = dequeue_op ? 1 : 2;
            }
            else if (op->opcode == BS_OP_SYNC)
            {
                // wait for all small writes to be submitted
                // wait for all big writes to complete, submit data device fsync
                // wait for the data device fsync to complete, then submit journal writes for big writes
                // then submit an fsync operation
                if (has_writes)
                {
                    // Can't submit SYNC before previous writes
                    continue;
                }
                dequeue_op = dequeue_sync(op);
            }
            else if (op->opcode == BS_OP_STABLE)
            {
                dequeue_op = dequeue_stable(op);
            }
            else if (op->opcode == BS_OP_ROLLBACK)
            {
                dequeue_op = dequeue_rollback(op);
            }
            else if (op->opcode == BS_OP_LIST)
            {
                process_list(op);
                dequeue_op = true;
            }
            if (dequeue_op)
            {
                submit_queue.erase(op_ptr);
            }
            else
            {
                ringloop->restore(prev_sqe_pos);
                if (PRIV(op)->wait_for == WAIT_SQE)
                {
                    PRIV(op)->wait_detail = 1 + ring_space;
                    // ring is full, stop submission
                    break;
                }
            }
        }
        if (!readonly)
        {
            flusher->loop();
        }
        if ((initial_ring_space - ringloop->space_left()) > 0)
        {
            live = true;
        }
        queue_stall = !live && !ringloop->get_loop_again();
        live = false;
    }
}

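// Used during shutdown: the blockstore may be stopped only when nothing is
// queued or in flight; if unsynced writes remain, a final SYNC is submitted first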
bool blockstore_impl_t::is_safe_to_stop()
{
    // It's safe to stop blockstore when there are no in-flight operations,
    // no in-progress syncs and flusher isn't doing anything
    if (submit_queue.size() > 0 || in_progress_syncs.size() > 0 || !readonly && flusher->is_active())
    {
        return false;
    }
    if (unsynced_big_writes.size() > 0 || unsynced_small_writes.size() > 0)
    {
        if (!readonly && !stop_sync_submitted)
        {
            // We should sync the blockstore before unmounting
            blockstore_op_t *op = new blockstore_op_t;
            op->opcode = BS_OP_SYNC;
            op->buf = NULL;
            op->callback = [](blockstore_op_t *op)
            {
                delete op;
            };
            enqueue_op(op);
            stop_sync_submitted = true;
        }
        return false;
    }
    return true;
}

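// Re-check the condition a postponed operation is waiting for; clear wait_for
// if it is satisfied so that loop() can try to submit the operation again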
void blockstore_impl_t::check_wait(blockstore_op_t *op)
{
    if (PRIV(op)->wait_for == WAIT_SQE)
    {
        if (ringloop->space_left() < PRIV(op)->wait_detail)
        {
            // stop submission if there's still no free space
            return;
        }
        PRIV(op)->wait_for = 0;
    }
    else if (PRIV(op)->wait_for == WAIT_IN_FLIGHT)
    {
        auto dirty_it = dirty_db.find((obj_ver_id){
            .oid = op->oid,
            .version = PRIV(op)->wait_detail,
        });
        if (dirty_it != dirty_db.end() && IS_IN_FLIGHT(dirty_it->second.state))
        {
            // do not submit
            return;
        }
        PRIV(op)->wait_for = 0;
    }
    else if (PRIV(op)->wait_for == WAIT_JOURNAL)
    {
        if (journal.used_start == PRIV(op)->wait_detail)
        {
            // do not submit
            return;
        }
        PRIV(op)->wait_for = 0;
    }
    else if (PRIV(op)->wait_for == WAIT_JOURNAL_BUFFER)
    {
        int next = ((journal.cur_sector + 1) % journal.sector_count);
        if (journal.sector_info[next].usage_count > 0 ||
            journal.sector_info[next].dirty)
        {
            // do not submit
            return;
        }
        PRIV(op)->wait_for = 0;
    }
    else if (PRIV(op)->wait_for == WAIT_FREE)
    {
        if (!data_alloc->get_free_count() && !flusher->is_active())
        {
            return;
        }
        PRIV(op)->wait_for = 0;
    }
    else
    {
        throw std::runtime_error("BUG: op->wait_for value is unexpected");
    }
}

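// Validate an incoming operation, initialise its private data and append it to
// the submission queue (or prepend it if 'first' is set). BS_OP_SYNC_STAB_ALL is
// rewritten into a SYNC whose callback re-enqueues a STABLE for all unstable writes.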
void blockstore_impl_t::enqueue_op(blockstore_op_t *op, bool first)
{
    if (op->opcode < BS_OP_MIN || op->opcode > BS_OP_MAX ||
        ((op->opcode == BS_OP_READ || op->opcode == BS_OP_WRITE) && (
            op->offset >= block_size ||
            op->len > block_size - op->offset ||
            (op->len % disk_alignment)
        )) ||
        readonly && op->opcode != BS_OP_READ ||
        first && op->opcode == BS_OP_WRITE)
    {
        // Basic verification not passed
        op->retval = -EINVAL;
        op->callback(op);
        return;
    }
    if (op->opcode == BS_OP_SYNC_STAB_ALL)
    {
        std::function<void(blockstore_op_t*)> *old_callback = new std::function<void(blockstore_op_t*)>(op->callback);
        op->opcode = BS_OP_SYNC;
        op->callback = [this, old_callback](blockstore_op_t *op)
        {
            if (op->retval >= 0 && unstable_writes.size() > 0)
            {
                op->opcode = BS_OP_STABLE;
                op->len = unstable_writes.size();
                obj_ver_id *vers = new obj_ver_id[op->len];
                op->buf = vers;
                int i = 0;
                for (auto it = unstable_writes.begin(); it != unstable_writes.end(); it++, i++)
                {
                    vers[i] = {
                        .oid = it->first,
                        .version = it->second,
                    };
                }
                unstable_writes.clear();
                op->callback = [this, old_callback](blockstore_op_t *op)
                {
                    obj_ver_id *vers = (obj_ver_id*)op->buf;
                    delete[] vers;
                    op->buf = NULL;
                    (*old_callback)(op);
                    delete old_callback;
                };
                this->enqueue_op(op);
            }
            else
            {
                (*old_callback)(op);
                delete old_callback;
            }
        };
    }
    if (op->opcode == BS_OP_WRITE && !enqueue_write(op))
    {
        op->callback(op);
        return;
    }
    if (0 && op->opcode == BS_OP_SYNC && immediate_commit)
    {
        op->retval = 0;
        op->callback(op);
        return;
    }
    // Call constructor without allocating memory. We'll call destructor before returning op back
    new ((void*)op->private_data) blockstore_op_private_t;
    PRIV(op)->wait_for = 0;
    PRIV(op)->sync_state = 0;
    PRIV(op)->pending_ops = 0;
    if (!first)
    {
        submit_queue.push_back(op);
    }
    else
    {
        submit_queue.push_front(op);
    }
    ringloop->wakeup();
}

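// BS_OP_LIST: enumerate stored objects, optionally filtered by placement group
// (op->offset = PG number, op->len = PG count, op->oid.stripe = parity block size).
// Stable versions are written to the buffer first; op->version holds their count.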
void blockstore_impl_t::process_list(blockstore_op_t *op)
{
    // Count objects
    uint32_t list_pg = op->offset;
    uint32_t pg_count = op->len;
    uint64_t parity_block_size = op->oid.stripe;
    if (pg_count != 0 && (parity_block_size < MIN_BLOCK_SIZE || list_pg >= pg_count))
    {
        op->retval = -EINVAL;
        FINISH_OP(op);
        return;
    }
    uint64_t stable_count = 0;
    if (pg_count > 0)
    {
        for (auto it = clean_db.begin(); it != clean_db.end(); it++)
        {
            uint32_t pg = (it->first.inode + it->first.stripe / parity_block_size) % pg_count;
            if (pg == list_pg)
            {
                stable_count++;
            }
        }
    }
    else
    {
        stable_count = clean_db.size();
    }
    uint64_t total_count = stable_count;
    for (auto it = dirty_db.begin(); it != dirty_db.end(); it++)
    {
        if (!pg_count || ((it->first.oid.inode + it->first.oid.stripe / parity_block_size) % pg_count) == list_pg)
        {
            if (IS_STABLE(it->second.state))
            {
                stable_count++;
            }
            total_count++;
        }
    }
    // Allocate memory
    op->version = stable_count;
    op->retval = total_count;
    op->buf = malloc(sizeof(obj_ver_id) * total_count);
    if (!op->buf)
    {
        op->retval = -ENOMEM;
        FINISH_OP(op);
        return;
    }
    obj_ver_id *vers = (obj_ver_id*)op->buf;
    int i = 0;
    for (auto it = clean_db.begin(); it != clean_db.end(); it++)
    {
        if (!pg_count || ((it->first.inode + it->first.stripe / parity_block_size) % pg_count) == list_pg)
        {
            vers[i++] = {
                .oid = it->first,
                .version = it->second.version,
            };
        }
    }
    int j = stable_count;
    for (auto it = dirty_db.begin(); it != dirty_db.end(); it++)
    {
        if (!pg_count || ((it->first.oid.inode + it->first.oid.stripe / parity_block_size) % pg_count) == list_pg)
        {
            if (IS_STABLE(it->second.state))
            {
                vers[i++] = it->first;
            }
            else
            {
                vers[j++] = it->first;
            }
        }
    }
    FINISH_OP(op);
}