From f4769ba7c76423b8ff6daefaf5332065f993f917 Mon Sep 17 00:00:00 2001 From: Vitaliy Filippov Date: Sat, 27 Mar 2021 23:21:18 +0300 Subject: [PATCH] Collapse create+delete journal entry pairs if they're already flushed Old journal replay mechanism could lead to a double allocation of the same block and a "Fatal error: tried to overwrite non-zero metadata entry" --- src/blockstore_init.cpp | 71 ++++++++++++++++++++++++----------------- src/blockstore_init.h | 1 + 2 files changed, 43 insertions(+), 29 deletions(-) diff --git a/src/blockstore_init.cpp b/src/blockstore_init.cpp index 4f6a238c..5faf4c2d 100644 --- a/src/blockstore_init.cpp +++ b/src/blockstore_init.cpp @@ -582,32 +582,10 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u // its data and metadata are already flushed. // We don't know if newer versions are flushed, but // the previous delete definitely is. - // So we flush previous dirty entries, but retain the clean one. + // So we forget previous dirty entries, but retain the clean one. // This feature is required for writes happening shortly // after deletes. - auto dirty_end = dirty_it; - dirty_end++; - while (1) - { - if (dirty_it == bs->dirty_db.begin()) - { - break; - } - dirty_it--; - if (dirty_it->first.oid != je->big_write.oid) - { - dirty_it++; - break; - } - } - auto clean_it = bs->clean_db.find(je->big_write.oid); - bs->erase_dirty( - dirty_it, dirty_end, - clean_it != bs->clean_db.end() ? clean_it->second.location : UINT64_MAX - ); - // Remove it from the flusher's queue, too - // Otherwise it may end up referring to a small unstable write after reading the rest of the journal - bs->flusher->remove_flush(je->big_write.oid); + erase_dirty_object(dirty_it); } } auto clean_it = bs->clean_db.find(je->big_write.oid); @@ -679,16 +657,26 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u #ifdef BLOCKSTORE_DEBUG printf("je_delete oid=%lx:%lx ver=%lu\n", je->del.oid.inode, je->del.oid.stripe, je->del.version); #endif + bool dirty_exists = false; auto dirty_it = bs->dirty_db.upper_bound((obj_ver_id){ .oid = je->del.oid, .version = UINT64_MAX, }); + if (dirty_it != bs->dirty_db.begin()) + { + dirty_it--; + dirty_exists = dirty_it->first.oid == je->del.oid; + } auto clean_it = bs->clean_db.find(je->del.oid); - // Ignore delete if neither preceding dirty entries nor the clean one are present - if ((clean_it != bs->clean_db.end() && - clean_it->second.version < je->del.version) || - (dirty_it != bs->dirty_db.begin() && - std::prev(dirty_it)->first.oid == je->del.oid)) + bool clean_exists = (clean_it != bs->clean_db.end() && + clean_it->second.version < je->del.version); + if (!clean_exists && dirty_exists) + { + // Clean entry doesn't exist. This means that the delete is already flushed. + // So we must not flush this object anymore. + erase_dirty_object(dirty_it); + } + else if (clean_exists || dirty_exists) { // oid, version obj_ver_id ov = { @@ -708,6 +696,7 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u // "2-phase commit" (write->stabilize) isn't sufficient for them anyway bs->mark_stable(ov); } + // Ignore delete if neither preceding dirty entries nor the clean one are present } started = true; pos += je->size; @@ -718,3 +707,27 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u bs->journal.next_free = next_free; return 1; } + +void blockstore_init_journal::erase_dirty_object(blockstore_dirty_db_t::iterator dirty_it) +{ + auto oid = dirty_it->first.oid; + auto dirty_end = dirty_it; + dirty_end++; + while (1) + { + if (dirty_it == bs->dirty_db.begin()) + { + break; + } + dirty_it--; + if (dirty_it->first.oid != oid) + { + dirty_it++; + break; + } + } + bs->erase_dirty(dirty_it, dirty_end, UINT64_MAX); + // Remove it from the flusher's queue, too + // Otherwise it may end up referring to a small unstable write after reading the rest of the journal + bs->flusher->remove_flush(oid); +} diff --git a/src/blockstore_init.h b/src/blockstore_init.h index a3d2d010..ad8f9970 100644 --- a/src/blockstore_init.h +++ b/src/blockstore_init.h @@ -48,6 +48,7 @@ class blockstore_init_journal std::function simple_callback; int handle_journal_part(void *buf, uint64_t done_pos, uint64_t len); void handle_event(ring_data_t *data); + void erase_dirty_object(blockstore_dirty_db_t::iterator dirty_it); public: blockstore_init_journal(blockstore_impl_t* bs); int loop();