From 571be0f380d0c303ee408cc2cfaf1d0c74d5e39b Mon Sep 17 00:00:00 2001 From: Vitaliy Filippov Date: Tue, 2 Jun 2020 20:43:28 +0300 Subject: [PATCH] Make deletions instantly stable The "2-phase" (write->stabilize) process is pointless for deletions because it doesn't protect us from incomplete objects. This happens because it removes the version information from metadata after stabilization. Deletions require a "3-phase" process with a potentially very long 3rd phase. So, deletions will be allowed to generate degraded and incomplete objects, and, so that this does not affect users' ability to delete something, the cluster will allow deleting whole inodes while storing a list of them in etcd. Proper TRIM will be impossible until the implementation of the aforementioned "3-phase" process, though. By the way, this change also fixes a possible write stall after rebalancing which was caused by the lack of "stabilize delete" operations. --- blockstore_init.cpp | 3 +++ blockstore_sync.cpp | 11 ++++++++++- blockstore_write.cpp | 5 +++++ osd.h | 2 +- osd_primary.cpp | 12 ++---------- 5 files changed, 21 insertions(+), 12 deletions(-) diff --git a/blockstore_init.cpp b/blockstore_init.cpp index 93b9ae0b..c3dbcc67 100644 --- a/blockstore_init.cpp +++ b/blockstore_init.cpp @@ -624,6 +624,9 @@ int blockstore_init_journal::handle_journal_part(void *buf, uint64_t done_pos, u .journal_sector = proc_pos, }); bs->journal.used_sectors[proc_pos]++; + // Deletions are treated as immediately stable, because + // "2-phase commit" (write->stabilize) isn't sufficient for them anyway + bs->mark_stable(ov); } } started = true; diff --git a/blockstore_sync.cpp b/blockstore_sync.cpp index 3f3899de..2e97660c 100644 --- a/blockstore_sync.cpp +++ b/blockstore_sync.cpp @@ -275,7 +275,16 @@ void blockstore_impl_t::ack_one_sync(blockstore_op_t *op) #endif auto & unstab = unstable_writes[it->oid]; unstab = unstab < it->version ? it->version : unstab; - dirty_db[*it].state = dirty_db[*it].state == ST_DEL_WRITTEN ? 
ST_DEL_SYNCED : ST_J_SYNCED; + if (dirty_db[*it].state == ST_DEL_WRITTEN) + { + dirty_db[*it].state = ST_DEL_SYNCED; + // Deletions are treated as immediately stable + mark_stable(*it); + } + else /* == ST_J_WRITTEN */ + { + dirty_db[*it].state = ST_J_SYNCED; + } } in_progress_syncs.erase(PRIV(op)->in_progress_ptr); op->retval = 0; diff --git a/blockstore_write.cpp b/blockstore_write.cpp index fcb41bef..b4f674cd 100644 --- a/blockstore_write.cpp +++ b/blockstore_write.cpp @@ -355,6 +355,11 @@ resume_4: else if (dirty_it->second.state == ST_DEL_SUBMITTED) { dirty_it->second.state = imm ? ST_DEL_SYNCED : ST_DEL_WRITTEN; + if (imm) + { + // Deletions are treated as immediately stable + mark_stable(dirty_it->first); + } } if (immediate_commit == IMMEDIATE_ALL) { diff --git a/osd.h b/osd.h index e52e5b6e..61a48999 100644 --- a/osd.h +++ b/osd.h @@ -193,7 +193,7 @@ class osd_t void continue_primary_del(osd_op_t *cur_op); bool check_write_queue(osd_op_t *cur_op, pg_t & pg); void remove_object_from_state(object_id & oid, pg_osd_set_state_t *object_state, pg_t &pg); - bool finalize_primary_write(osd_op_t *cur_op, pg_t & pg, pg_osd_set_t & loc_set, int base_state); + bool remember_unstable_write(osd_op_t *cur_op, pg_t & pg, pg_osd_set_t & loc_set, int base_state); void handle_primary_subop(osd_op_t *subop, osd_op_t *cur_op); void handle_primary_bs_subop(osd_op_t *subop); void add_bs_subop_stats(osd_op_t *subop); diff --git a/osd_primary.cpp b/osd_primary.cpp index a1663f4b..d00fa3e7 100644 --- a/osd_primary.cpp +++ b/osd_primary.cpp @@ -284,7 +284,7 @@ resume_9: // FIXME: Check for immediate_commit == IMMEDIATE_SMALL resume_6: resume_7: - if (!finalize_primary_write(cur_op, pg, pg.cur_loc_set, 6)) + if (!remember_unstable_write(cur_op, pg, pg.cur_loc_set, 6)) { return; } @@ -305,7 +305,7 @@ resume_7: } } -bool osd_t::finalize_primary_write(osd_op_t *cur_op, pg_t & pg, pg_osd_set_t & loc_set, int base_state) +bool osd_t::remember_unstable_write(osd_op_t *cur_op, pg_t & pg, 
pg_osd_set_t & loc_set, int base_state) { osd_primary_op_data_t *op_data = cur_op->op_data; if (op_data->st == base_state) @@ -598,8 +598,6 @@ void osd_t::continue_primary_del(osd_op_t *cur_op) else if (op_data->st == 3) goto resume_3; else if (op_data->st == 4) goto resume_4; else if (op_data->st == 5) goto resume_5; - else if (op_data->st == 6) goto resume_6; - else if (op_data->st == 7) goto resume_7; assert(op_data->st == 0); // Delete is forbidden even in active PGs if they're also degraded or have previous dead OSDs if (pg.state & (PG_DEGRADED | PG_LEFT_ON_DEAD)) @@ -641,12 +639,6 @@ resume_5: } // Remove version override pg.ver_override.erase(op_data->oid); -resume_6: -resume_7: - if (!finalize_primary_write(cur_op, pg, op_data->object_state ? op_data->object_state->osd_set : pg.cur_loc_set, 6)) - { - return; - } // Adjust PG stats after "instant stabilize", because we need object_state above if (!op_data->object_state) {