forked from vitalif/vitastor
Collapse create+delete journal entry pairs if they're already flushed
Old journal replay mechanism could lead to a double allocation of the same block and a "Fatal error: tried to overwrite non-zero metadata entry".
rel-0.5
parent
843b7052d2
commit
f4769ba7c7
|
@ -582,32 +582,10 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
|
||||||
// its data and metadata are already flushed.
|
// its data and metadata are already flushed.
|
||||||
// We don't know if newer versions are flushed, but
|
// We don't know if newer versions are flushed, but
|
||||||
// the previous delete definitely is.
|
// the previous delete definitely is.
|
||||||
// So we flush previous dirty entries, but retain the clean one.
|
// So we forget previous dirty entries, but retain the clean one.
|
||||||
// This feature is required for writes happening shortly
|
// This feature is required for writes happening shortly
|
||||||
// after deletes.
|
// after deletes.
|
||||||
auto dirty_end = dirty_it;
|
erase_dirty_object(dirty_it);
|
||||||
dirty_end++;
|
|
||||||
while (1)
|
|
||||||
{
|
|
||||||
if (dirty_it == bs->dirty_db.begin())
|
|
||||||
{
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
dirty_it--;
|
|
||||||
if (dirty_it->first.oid != je->big_write.oid)
|
|
||||||
{
|
|
||||||
dirty_it++;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
auto clean_it = bs->clean_db.find(je->big_write.oid);
|
|
||||||
bs->erase_dirty(
|
|
||||||
dirty_it, dirty_end,
|
|
||||||
clean_it != bs->clean_db.end() ? clean_it->second.location : UINT64_MAX
|
|
||||||
);
|
|
||||||
// Remove it from the flusher's queue, too
|
|
||||||
// Otherwise it may end up referring to a small unstable write after reading the rest of the journal
|
|
||||||
bs->flusher->remove_flush(je->big_write.oid);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
auto clean_it = bs->clean_db.find(je->big_write.oid);
|
auto clean_it = bs->clean_db.find(je->big_write.oid);
|
||||||
|
@ -679,16 +657,26 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
|
||||||
#ifdef BLOCKSTORE_DEBUG
|
#ifdef BLOCKSTORE_DEBUG
|
||||||
printf("je_delete oid=%lx:%lx ver=%lu\n", je->del.oid.inode, je->del.oid.stripe, je->del.version);
|
printf("je_delete oid=%lx:%lx ver=%lu\n", je->del.oid.inode, je->del.oid.stripe, je->del.version);
|
||||||
#endif
|
#endif
|
||||||
|
bool dirty_exists = false;
|
||||||
auto dirty_it = bs->dirty_db.upper_bound((obj_ver_id){
|
auto dirty_it = bs->dirty_db.upper_bound((obj_ver_id){
|
||||||
.oid = je->del.oid,
|
.oid = je->del.oid,
|
||||||
.version = UINT64_MAX,
|
.version = UINT64_MAX,
|
||||||
});
|
});
|
||||||
|
if (dirty_it != bs->dirty_db.begin())
|
||||||
|
{
|
||||||
|
dirty_it--;
|
||||||
|
dirty_exists = dirty_it->first.oid == je->del.oid;
|
||||||
|
}
|
||||||
auto clean_it = bs->clean_db.find(je->del.oid);
|
auto clean_it = bs->clean_db.find(je->del.oid);
|
||||||
// Ignore delete if neither preceding dirty entries nor the clean one are present
|
bool clean_exists = (clean_it != bs->clean_db.end() &&
|
||||||
if ((clean_it != bs->clean_db.end() &&
|
clean_it->second.version < je->del.version);
|
||||||
clean_it->second.version < je->del.version) ||
|
if (!clean_exists && dirty_exists)
|
||||||
(dirty_it != bs->dirty_db.begin() &&
|
{
|
||||||
std::prev(dirty_it)->first.oid == je->del.oid))
|
// Clean entry doesn't exist. This means that the delete is already flushed.
|
||||||
|
// So we must not flush this object anymore.
|
||||||
|
erase_dirty_object(dirty_it);
|
||||||
|
}
|
||||||
|
else if (clean_exists || dirty_exists)
|
||||||
{
|
{
|
||||||
// oid, version
|
// oid, version
|
||||||
obj_ver_id ov = {
|
obj_ver_id ov = {
|
||||||
|
@ -708,6 +696,7 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
|
||||||
// "2-phase commit" (write->stabilize) isn't sufficient for them anyway
|
// "2-phase commit" (write->stabilize) isn't sufficient for them anyway
|
||||||
bs->mark_stable(ov);
|
bs->mark_stable(ov);
|
||||||
}
|
}
|
||||||
|
// Ignore delete if neither preceding dirty entries nor the clean one are present
|
||||||
}
|
}
|
||||||
started = true;
|
started = true;
|
||||||
pos += je->size;
|
pos += je->size;
|
||||||
|
@ -718,3 +707,27 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
|
||||||
bs->journal.next_free = next_free;
|
bs->journal.next_free = next_free;
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Forget all dirty_db entries of one object up to and including the given one.
//
// dirty_it must point at a valid dirty_db entry. The erased range starts at the
// earliest entry that is reachable by stepping backwards from dirty_it while
// the object id stays the same, and ends right after dirty_it itself.
// The object is also removed from the flusher's queue.
void blockstore_init_journal::erase_dirty_object(blockstore_dirty_db_t::iterator dirty_it)
{
    auto target_oid = dirty_it->first.oid;
    // Exclusive upper bound: the entry right after the one we were handed
    auto range_end = dirty_it;
    ++range_end;
    // Step back while the preceding entry still belongs to the same object
    auto range_begin = dirty_it;
    while (range_begin != bs->dirty_db.begin())
    {
        auto before = range_begin;
        --before;
        if (before->first.oid != target_oid)
        {
            break;
        }
        range_begin = before;
    }
    bs->erase_dirty(range_begin, range_end, UINT64_MAX);
    // Remove it from the flusher's queue, too
    // Otherwise it may end up referring to a small unstable write after reading the rest of the journal
    bs->flusher->remove_flush(target_oid);
}
|
||||||
|
|
|
@ -48,6 +48,7 @@ class blockstore_init_journal
|
||||||
std::function<void(ring_data_t*)> simple_callback;
|
std::function<void(ring_data_t*)> simple_callback;
|
||||||
int handle_journal_part(void *buf, uint64_t done_pos, uint64_t len);
|
int handle_journal_part(void *buf, uint64_t done_pos, uint64_t len);
|
||||||
void handle_event(ring_data_t *data);
|
void handle_event(ring_data_t *data);
|
||||||
|
void erase_dirty_object(blockstore_dirty_db_t::iterator dirty_it);
|
||||||
public:
|
public:
|
||||||
blockstore_init_journal(blockstore_impl_t* bs);
|
blockstore_init_journal(blockstore_impl_t* bs);
|
||||||
int loop();
|
int loop();
|
||||||
|
|
Loading…
Reference in New Issue