Allow situations where the journal contains a big_write(v1) after delete(v2) and v1 < v2
Fixes a crash in the following scenario: - client issues a delete request (object version is at least 2) - OSD has time to flush it to the metadata, but doesn't have time to move the journal start pointer on disk - client overwrites the same object and it gets the version number 1 again - OSD is restarted and sees delete(v=2), big_write(v=1) in the journal - dirty_db sequence gets broken and OSD crashes with assert("Writes and deletes shouldn't happen at the same time")
parent
bcc8e697f9
commit
089f138e0c
|
@ -562,6 +562,45 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u
|
||||||
je->big_write.oid.inode, je->big_write.oid.stripe, je->big_write.version, je->big_write.location
|
je->big_write.oid.inode, je->big_write.oid.stripe, je->big_write.version, je->big_write.location
|
||||||
);
|
);
|
||||||
#endif
|
#endif
|
||||||
|
auto dirty_it = bs->dirty_db.upper_bound((obj_ver_id){
|
||||||
|
.oid = je->big_write.oid,
|
||||||
|
.version = UINT64_MAX,
|
||||||
|
});
|
||||||
|
if (dirty_it != bs->dirty_db.begin() && bs->dirty_db.size() > 0)
|
||||||
|
{
|
||||||
|
dirty_it--;
|
||||||
|
if (dirty_it->first.oid == je->big_write.oid &&
|
||||||
|
(dirty_it->second.state & BS_ST_TYPE_MASK) == BS_ST_DELETE)
|
||||||
|
{
|
||||||
|
// It is allowed to overwrite a deleted object with a
|
||||||
|
// version number less than deletion version number,
|
||||||
|
// because the presence of a BIG_WRITE entry means that
|
||||||
|
// the data for it is already on disk.
|
||||||
|
// Purge all dirty and clean entries for this object.
|
||||||
|
auto dirty_end = dirty_it;
|
||||||
|
dirty_end++;
|
||||||
|
while (1)
|
||||||
|
{
|
||||||
|
if (dirty_it == bs->dirty_db.begin())
|
||||||
|
{
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
dirty_it--;
|
||||||
|
if (dirty_it->first.oid != je->big_write.oid)
|
||||||
|
{
|
||||||
|
dirty_it++;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
bs->erase_dirty(dirty_it, dirty_end, UINT64_MAX);
|
||||||
|
auto clean_it = bs->clean_db.find(je->big_write.oid);
|
||||||
|
if (clean_it != bs->clean_db.end())
|
||||||
|
{
|
||||||
|
bs->data_alloc->set(clean_it->second.location >> bs->block_order, false);
|
||||||
|
bs->clean_db.erase(clean_it);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
auto clean_it = bs->clean_db.find(je->big_write.oid);
|
auto clean_it = bs->clean_db.find(je->big_write.oid);
|
||||||
if (clean_it == bs->clean_db.end() ||
|
if (clean_it == bs->clean_db.end() ||
|
||||||
clean_it->second.version < je->big_write.version)
|
clean_it->second.version < je->big_write.version)
|
||||||
|
|
|
@ -492,7 +492,10 @@ int blockstore_impl_t::dequeue_del(blockstore_op_t *op)
|
||||||
prepare_journal_sector_write(journal, journal.cur_sector, sqe, cb);
|
prepare_journal_sector_write(journal, journal.cur_sector, sqe, cb);
|
||||||
PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 1 + journal.cur_sector;
|
PRIV(op)->min_flushed_journal_sector = PRIV(op)->max_flushed_journal_sector = 1 + journal.cur_sector;
|
||||||
PRIV(op)->pending_ops++;
|
PRIV(op)->pending_ops++;
|
||||||
// Remember small write as unsynced
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
// Remember delete as unsynced
|
||||||
unsynced_small_writes.push_back((obj_ver_id){
|
unsynced_small_writes.push_back((obj_ver_id){
|
||||||
.oid = op->oid,
|
.oid = op->oid,
|
||||||
.version = op->version,
|
.version = op->version,
|
||||||
|
|
Loading…
Reference in New Issue